In [1]:
pip install nltk matplotlib seaborn wordcloud
Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Requirement already satisfied: matplotlib in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (3.9.4)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting wordcloud
  Downloading wordcloud-1.9.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (3.4 kB)
Collecting click (from nltk)
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Requirement already satisfied: joblib in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from nltk) (1.4.2)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp39-cp39-macosx_11_0_arm64.whl.metadata (40 kB)
Requirement already satisfied: tqdm in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from nltk) (4.67.1)
Requirement already satisfied: contourpy>=1.0.1 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from matplotlib) (1.3.0)
Requirement already satisfied: cycler>=0.10 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from matplotlib) (4.55.3)
Requirement already satisfied: kiwisolver>=1.3.1 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from matplotlib) (1.4.7)
Requirement already satisfied: numpy>=1.23 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from matplotlib) (2.0.2)
Requirement already satisfied: packaging>=20.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from matplotlib) (24.2)
Requirement already satisfied: pillow>=8 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from matplotlib) (11.1.0)
Requirement already satisfied: pyparsing>=2.3.1 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from matplotlib) (3.2.1)
Requirement already satisfied: python-dateutil>=2.7 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from matplotlib) (2.9.0.post0)
Requirement already satisfied: importlib-resources>=3.2.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from matplotlib) (6.5.2)
Requirement already satisfied: pandas>=1.2 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from seaborn) (2.2.3)
Requirement already satisfied: zipp>=3.1.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from importlib-resources>=3.2.0->matplotlib) (3.21.0)
Requirement already satisfied: pytz>=2020.1 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from pandas>=1.2->seaborn) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from pandas>=1.2->seaborn) (2024.2)
Requirement already satisfied: six>=1.5 in /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib) (1.15.0)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Downloading wordcloud-1.9.4-cp39-cp39-macosx_11_0_arm64.whl (168 kB)
Downloading regex-2024.11.6-cp39-cp39-macosx_11_0_arm64.whl (284 kB)
Using cached click-8.1.8-py3-none-any.whl (98 kB)
Installing collected packages: regex, click, nltk, wordcloud, seaborn
Successfully installed click-8.1.8 nltk-3.9.1 regex-2024.11.6 seaborn-0.13.2 wordcloud-1.9.4
Note: you may need to restart the kernel to use updated packages.
In [2]:
import os
import nltk
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud


%matplotlib inline


nltk.download('punkt')
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mmadhusudan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Out[2]:
True
In [3]:
def process_file(file_path):
    """
    Reads a text file, removes Gutenberg boilerplate (header and footer), tokenizes the text,
    converts tokens to lowercase, and filters out non-alphabetic tokens.
    
    Returns:
        tokens (list): A list of cleaned, alphabetic tokens.
    """
    
    with open(file_path, "r", encoding="utf-8") as f:
        raw_text = f.read()
    
   
    start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
    end_marker   = "*** END OF THIS PROJECT GUTENBERG EBOOK"
    
    
    start_idx = raw_text.find(start_marker)
    if start_idx != -1:
        
        text = raw_text[start_idx + len(start_marker):]
    else:
        text = raw_text  # If no marker found, use entire text
    
    
    end_idx = text.find(end_marker)
    if end_idx != -1:
        text = text[:end_idx]
    
   
    text = text.strip()
    
    
    tokens = nltk.word_tokenize(text.lower())
    
    tokens = [token for token in tokens if token.isalpha()]
    
    return tokens


test_file = os.path.join("Gutenberg_Books", "1.txt.txt")
print("Test file tokens (first 20):", process_file(test_file)[:20])
Test file tokens (first 20): ['start', 'of', 'the', 'project', 'gutenberg', 'ebook', 'note', 'this', 'file', 'combines', 'the', 'first', 'two', 'project', 'gutenberg', 'files', 'both', 'of', 'which', 'were']
In [4]:
folder = "Gutenberg_Books"


all_files = [f for f in os.listdir(folder) if f.endswith(".txt.txt")]
print(f"Found {len(all_files)} files.")


aggregated_counter = Counter()
file_stats = []  # This will hold stats for each file


for filename in all_files:
    file_path = os.path.join(folder, filename)
    tokens = process_file(file_path)
    
    aggregated_counter.update(tokens)
    
    file_stats.append({
        "filename": filename,
        "num_tokens": len(tokens),
        "unique_tokens": len(set(tokens))
    })


df_stats = pd.DataFrame(file_stats)
print("Per-file statistics (first 5 rows):")
display(df_stats.head())
Found 2475 files.
Per-file statistics (first 5 rows):
filename num_tokens unique_tokens
0 4658.txt.txt 161293 15819
1 37009.txt.txt 77551 4991
2 14609.txt.txt 89507 11561
3 5342.txt.txt 87301 6824
4 17.txt.txt 268340 5539
In [5]:
most_common_all = aggregated_counter.most_common(20)
print("Aggregated Top 20 words:")
print(most_common_all)


words, counts = zip(*most_common_all)


plt.figure(figsize=(10, 6))
sns.barplot(x=list(counts), y=list(words), palette="viridis")
plt.title("Aggregated Top 20 Most Common Words")
plt.xlabel("Frequency")
plt.ylabel("Words")
plt.tight_layout()
plt.show()
Aggregated Top 20 words:
[('the', 13395724), ('of', 7642031), ('and', 6378760), ('to', 4934479), ('a', 3908621), ('in', 3857791), ('that', 2092342), ('is', 1800929), ('it', 1767951), ('was', 1723232), ('i', 1623000), ('he', 1620363), ('with', 1496503), ('as', 1448598), ('for', 1433725), ('his', 1393159), ('by', 1266249), ('on', 1171581), ('be', 1141795), ('not', 1094746)]
/var/folders/7j/rv3w77nj6kb6kw_ssltcqpkr0000gp/T/ipykernel_22400/2636935665.py:11: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=list(counts), y=list(words), palette="viridis")
No description has been provided for this image
In [6]:
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(aggregated_counter)


plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Aggregated Word Cloud from All Gutenberg Books")
plt.show()
No description has been provided for this image
In [7]:
df_stats['lexical_diversity'] = df_stats['unique_tokens'] / df_stats['num_tokens']


display(df_stats.head())


plt.figure(figsize=(10, 6))
sns.histplot(df_stats['lexical_diversity'], kde=True, bins=20, color='skyblue')
plt.title("Distribution of Lexical Diversity Across Gutenberg Books")
plt.xlabel("Lexical Diversity (Unique Tokens / Total Tokens)")
plt.ylabel("Number of Books")
plt.tight_layout()
plt.show()

plt.figure(figsize=(8, 4))
sns.boxplot(x=df_stats['lexical_diversity'], color='lightgreen')
plt.title("Boxplot of Lexical Diversity Across Books")
plt.xlabel("Lexical Diversity")
plt.tight_layout()
plt.show()
filename num_tokens unique_tokens lexical_diversity
0 4658.txt.txt 161293 15819 0.098076
1 37009.txt.txt 77551 4991 0.064358
2 14609.txt.txt 89507 11561 0.129163
3 5342.txt.txt 87301 6824 0.078166
4 17.txt.txt 268340 5539 0.020642
No description has been provided for this image
No description has been provided for this image
In [8]:
all_tokens = []
for filename in all_files:
    file_path = os.path.join(folder, filename)
    tokens = process_file(file_path)
    all_tokens.extend(tokens)

print("Total tokens collected from all files:", len(all_tokens))
Total tokens collected from all files: 209085770
In [9]:
import nltk
nltk.download('averaged_perceptron_tagger')
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mmadhusudan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Out[9]:
True
In [10]:
print(nltk.data.path)
nltk.download('punkt')
['/Users/mmadhusudan/nltk_data', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/nltk_data', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/share/nltk_data', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mmadhusudan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Out[10]:
True
In [11]:
import nltk
nltk.download('averaged_perceptron_tagger')
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/mmadhusudan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Out[11]:
True
In [12]:
print(nltk.data.find('taggers/averaged_perceptron_tagger'))
/Users/mmadhusudan/nltk_data/taggers/averaged_perceptron_tagger
In [13]:
import random

sample_size = min(5000, len(all_tokens))
sample_tokens = random.sample(all_tokens, sample_size)
In [14]:
file_path = os.path.join("Gutenberg_Books", "1.txt.txt")
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()


start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
end_marker   = "*** END OF THIS PROJECT GUTENBERG EBOOK"
start_idx = raw_text.find(start_marker)
if start_idx != -1:
    text = raw_text[start_idx + len(start_marker):]
else:
    text = raw_text

end_idx = text.find(end_marker)
if end_idx != -1:
    cleaned_text = text[:end_idx]
else:
    cleaned_text = text

cleaned_text = cleaned_text.strip()
In [15]:
# --- Sentence-Level Analysis ---


nltk.download('punkt')


sentences = nltk.sent_tokenize(cleaned_text)


sentence_lengths = [len(nltk.word_tokenize(sentence)) for sentence in sentences]


print("Number of sentences:", len(sentences))
print("Average sentence length (words):", sum(sentence_lengths) / len(sentence_lengths))


plt.figure(figsize=(12, 6))
sns.histplot(sentence_lengths, bins=30, kde=True, color='skyblue')
plt.title("Distribution of Sentence Lengths in Book 1")
plt.xlabel("Sentence Length (number of words)")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()
Number of sentences: 833
Average sentence length (words): 27.613445378151262
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mmadhusudan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
No description has been provided for this image
In [24]:
!pip install spacy
!python -m spacy download en_core_web_sm
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: spacy in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (3.8.3)
Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.11 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (3.0.12)
Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (1.0.5)
Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (1.0.12)
Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (2.0.11)
Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (3.0.9)
Requirement already satisfied: thinc<8.4.0,>=8.3.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (8.3.4)
Requirement already satisfied: wasabi<1.2.0,>=0.9.1 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (1.1.3)
Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (2.5.1)
Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (2.0.10)
Requirement already satisfied: weasel<0.5.0,>=0.1.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (0.4.1)
Requirement already satisfied: typer<1.0.0,>=0.3.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (0.15.1)
Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (4.67.1)
Requirement already satisfied: numpy>=1.19.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (2.0.2)
Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (2.32.3)
Requirement already satisfied: pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (2.10.6)
Requirement already satisfied: jinja2 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (3.1.5)
Requirement already satisfied: setuptools in /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/site-packages (from spacy) (58.0.4)
Requirement already satisfied: packaging>=20.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (24.2)
Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from spacy) (3.5.0)
Requirement already satisfied: language-data>=1.2 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from langcodes<4.0.0,>=3.2.0->spacy) (1.3.0)
Requirement already satisfied: annotated-types>=0.6.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (0.7.0)
Requirement already satisfied: pydantic-core==2.27.2 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (2.27.2)
Requirement already satisfied: typing-extensions>=4.12.2 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy) (4.12.2)
Requirement already satisfied: charset-normalizer<4,>=2 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.4.1)
Requirement already satisfied: idna<4,>=2.5 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from requests<3.0.0,>=2.13.0->spacy) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2.3.0)
Requirement already satisfied: certifi>=2017.4.17 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from requests<3.0.0,>=2.13.0->spacy) (2024.12.14)
Requirement already satisfied: blis<1.3.0,>=1.2.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from thinc<8.4.0,>=8.3.0->spacy) (1.2.0)
Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from thinc<8.4.0,>=8.3.0->spacy) (0.1.5)
Requirement already satisfied: click>=8.0.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from typer<1.0.0,>=0.3.0->spacy) (8.1.8)
Requirement already satisfied: shellingham>=1.3.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from typer<1.0.0,>=0.3.0->spacy) (1.5.4)
Requirement already satisfied: rich>=10.11.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from typer<1.0.0,>=0.3.0->spacy) (13.9.4)
Requirement already satisfied: cloudpathlib<1.0.0,>=0.7.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from weasel<0.5.0,>=0.1.0->spacy) (0.20.0)
Requirement already satisfied: smart-open<8.0.0,>=5.2.1 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from weasel<0.5.0,>=0.1.0->spacy) (7.1.0)
Requirement already satisfied: MarkupSafe>=2.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from jinja2->spacy) (3.0.2)
Requirement already satisfied: marisa-trie>=1.1.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from language-data>=1.2->langcodes<4.0.0,>=3.2.0->spacy) (1.2.1)
Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy) (3.0.0)
Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy) (2.19.1)
Requirement already satisfied: wrapt in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from smart-open<8.0.0,>=5.2.1->weasel<0.5.0,>=0.1.0->spacy) (1.17.2)
Requirement already satisfied: mdurl~=0.1 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from markdown-it-py>=2.2.0->rich>=10.11.0->typer<1.0.0,>=0.3.0->spacy) (0.1.2)
zsh:1: command not found: python
In [25]:
!python3 -m spacy download en_core_web_sm
/Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020
  warnings.warn(
Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
In [26]:
import spacy
from collections import Counter


nlp = spacy.load("en_core_web_sm")


doc = nlp(cleaned_text)


entities = [(ent.text, ent.label_) for ent in doc.ents]


entity_counts = Counter([ent.label_ for ent in doc.ents])


df_entities = pd.DataFrame(entity_counts.items(), columns=['Entity Type', 'Count']).sort_values(by='Count', ascending=False)

print("Named Entity counts:")
display(df_entities)


plt.figure(figsize=(10, 6))
sns.barplot(data=df_entities.head(10), x='Count', y='Entity Type', palette="magma")
plt.title("Top 10 Named Entity Types in Book 1")
plt.xlabel("Frequency")
plt.ylabel("Entity Type")
plt.tight_layout()
plt.show()
Named Entity counts:
Entity Type Count
4 ORG 429
5 GPE 205
3 PERSON 197
2 DATE 145
1 CARDINAL 114
6 LAW 67
8 WORK_OF_ART 56
0 ORDINAL 33
7 NORP 31
14 PRODUCT 18
15 LOC 16
9 MONEY 14
13 EVENT 8
12 FAC 7
16 TIME 6
11 PERCENT 5
10 QUANTITY 4
17 LANGUAGE 1
/var/folders/7j/rv3w77nj6kb6kw_ssltcqpkr0000gp/T/ipykernel_22400/556953189.py:24: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_entities.head(10), x='Count', y='Entity Type', palette="magma")
No description has been provided for this image
In [27]:
 
Named Entity counts:
Entity Type Count
4 ORG 429
5 GPE 205
3 PERSON 197
2 DATE 145
1 CARDINAL 114
6 LAW 67
8 WORK_OF_ART 56
0 ORDINAL 33
7 NORP 31
14 PRODUCT 18
15 LOC 16
9 MONEY 14
13 EVENT 8
12 FAC 7
16 TIME 6
11 PERCENT 5
10 QUANTITY 4
17 LANGUAGE 1
/var/folders/7j/rv3w77nj6kb6kw_ssltcqpkr0000gp/T/ipykernel_22400/556953189.py:24: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_entities.head(10), x='Count', y='Entity Type', palette="magma")
No description has been provided for this image
In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


def get_cleaned_text(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        raw_text = f.read()
    
    start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
    end_marker   = "*** END OF THIS PROJECT GUTENBERG EBOOK"
    
    start_idx = raw_text.find(start_marker)
    if start_idx != -1:
        text = raw_text[start_idx + len(start_marker):]
    else:
        text = raw_text

    end_idx = text.find(end_marker)
    if end_idx != -1:
        text = text[:end_idx]
    
    # Remove extra whitespace and return
    return text.strip()


corpus = []
doc_names = []  # Keep track of file names (optional)
for filename in all_files:
    file_path = os.path.join(folder, filename)
    doc = get_cleaned_text(file_path)
    corpus.append(doc)
    doc_names.append(filename)

print(f"Collected {len(corpus)} documents.")


vectorizer = CountVectorizer(stop_words='english', max_df=0.95, min_df=2)
dtm = vectorizer.fit_transform(corpus)
print("DTM shape:", dtm.shape)


n_topics = 5
lda_model = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda_model.fit(dtm)


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print(f"Topic #{topic_idx}: {' '.join(top_words)}")

n_top_words = 10
feature_names = vectorizer.get_feature_names_out()
print("\nTop words per topic:")
print_top_words(lda_model, feature_names, n_top_words)
Collected 2475 documents.
DTM shape: (2475, 526829)

Top words per topic:
Topic #0: water small time great form species large long work used
Topic #1: said man time great men did like day good little
Topic #2: la et le les il que en des qui est
Topic #3: die der en que la und el den se los
Topic #4: est 000 km years na total male female population rate
In [30]:
!pip install networkx
Defaulting to user installation because normal site-packages is not writeable
Collecting networkx
  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Downloading networkx-3.2.1-py3-none-any.whl (1.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 5.2 MB/s eta 0:00:00a 0:00:01
Installing collected packages: networkx
Successfully installed networkx-3.2.1
In [41]:
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np


window_size = 5  # Sliding window size
co_occurrence = {}


for i, token in enumerate(tokens):
    for j in range(i+1, min(i+window_size, len(tokens))):
        pair = tuple(sorted([token, tokens[j]]))
        co_occurrence[pair] = co_occurrence.get(pair, 0) + 1


G = nx.Graph()
threshold = 5
for pair, weight in co_occurrence.items():
    if weight >= threshold:
        G.add_edge(pair[0], pair[1], weight=weight)

print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")


deg_centrality = nx.degree_centrality(G)
degrees = dict(G.degree())
node_color = [deg_centrality[node] for node in G.nodes()]
node_size = [degrees[node] * 100 for node in G.nodes()]
edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
edge_width = [w / 2 for w in edge_weights]


fig, ax = plt.subplots(figsize=(15, 15))
pos = nx.spring_layout(G, k=0.15, seed=42)


nodes = nx.draw_networkx_nodes(
    G, pos, ax=ax,
    node_size=node_size,
    node_color=node_color,
    cmap=cm.viridis,
    alpha=0.9
)


sm = plt.cm.ScalarMappable(cmap=cm.viridis, norm=plt.Normalize(vmin=min(node_color), vmax=max(node_color)))
sm.set_array(np.array(node_color))
fig.colorbar(sm, ax=ax, label="Degree Centrality")


edges = nx.draw_networkx_edges(
    G, pos, ax=ax,
    width=edge_width,
    edge_color=edge_weights,
    edge_cmap=cm.plasma,
    alpha=0.7
)


ecolor = np.array(edge_weights)
sm2 = plt.cm.ScalarMappable(cmap=cm.plasma, norm=plt.Normalize(vmin=min(ecolor), vmax=max(ecolor)))
sm2.set_array(ecolor)
fig.colorbar(sm2, ax=ax, label="Edge Weight (Co-occurrence)")


centrality_values = np.array(list(deg_centrality.values()))
threshold_label = np.percentile(centrality_values, 90)
high_central_nodes = {node: node for node in G.nodes() if deg_centrality[node] >= threshold_label}
nx.draw_networkx_labels(G, pos, labels=high_central_nodes, font_size=10, font_color='black', ax=ax)

ax.set_title("Informative Word Co-occurrence Network")
ax.axis("off")
plt.tight_layout()
plt.show()
Graph has 14 nodes and 17 edges.
No description has been provided for this image
In [42]:
import nltk

nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sentences = nltk.sent_tokenize(cleaned_text)


sia = SentimentIntensityAnalyzer()

sentiment_scores = [sia.polarity_scores(sentence)['compound'] for sentence in sentences]


print("Number of sentences:", len(sentences))
print("Average compound sentiment score:", sum(sentiment_scores)/len(sentiment_scores))


plt.figure(figsize=(10, 6))
sns.histplot(sentiment_scores, bins=30, kde=True, color='coral')
plt.title("Distribution of Compound Sentiment Scores Across Sentences")
plt.xlabel("Compound Sentiment Score")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 6))
plt.plot(sentiment_scores, color='blue', alpha=0.6)
plt.title("Sentiment (Compound Score) Over the Course of the Text")
plt.xlabel("Sentence Index")
plt.ylabel("Compound Sentiment Score")
plt.tight_layout()
plt.show()
Number of sentences: 833
Average compound sentiment score: 0.08476578631452589
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mmadhusudan/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
No description has been provided for this image
No description has been provided for this image
In [43]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import os
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


nltk.download('punkt')
nltk.download('vader_lexicon')

def get_cleaned_text(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        raw_text = f.read()
    start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
    end_marker = "*** END OF THIS PROJECT GUTENBERG EBOOK"
    start_idx = raw_text.find(start_marker)
    if start_idx != -1:
        text = raw_text[start_idx + len(start_marker):]
    else:
        text = raw_text
    end_idx = text.find(end_marker)
    if end_idx != -1:
        text = text[:end_idx]
    return text.strip()


def compute_file_metrics(file_path):
    text = get_cleaned_text(file_path)
    # Word-level analysis:
    tokens = nltk.word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalpha()]
    num_tokens = len(tokens)
    num_unique = len(set(tokens))
    lexical_diversity = num_unique / num_tokens if num_tokens > 0 else 0
    
    # Sentence-level analysis:
    sentences = nltk.sent_tokenize(text)
    num_sentences = len(sentences)
    sentence_lengths = [len(nltk.word_tokenize(s)) for s in sentences]
    avg_sentence_length = sum(sentence_lengths)/num_sentences if num_sentences > 0 else 0
    
    # Sentiment analysis using VADER:
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = [sia.polarity_scores(s)['compound'] for s in sentences]
    avg_sentiment = sum(sentiment_scores)/num_sentences if num_sentences > 0 else 0
    
    return {
        'file': os.path.basename(file_path),
        'num_tokens': num_tokens,
        'num_unique': num_unique,
        'lexical_diversity': lexical_diversity,
        'num_sentences': num_sentences,
        'avg_sentence_length': avg_sentence_length,
        'avg_sentiment': avg_sentiment
    }


folder = "Gutenberg_Books"
all_files = [f for f in os.listdir(folder) if f.endswith(".txt.txt")]


metrics_list = []
for filename in all_files:
    file_path = os.path.join(folder, filename)
    try:
        metrics = compute_file_metrics(file_path)
        metrics_list.append(metrics)
    except Exception as e:
        print(f"Error processing {filename}: {e}")


df_metrics = pd.DataFrame(metrics_list)
print("Per-file Metrics:")
display(df_metrics)


sns.set(style="whitegrid", context="talk")
plt.figure(figsize=(8,6))
sns.scatterplot(data=df_metrics, x='avg_sentence_length', y='lexical_diversity', 
                hue='avg_sentiment', palette='coolwarm', s=100)
plt.title("Lexical Diversity vs. Average Sentence Length")
plt.xlabel("Average Sentence Length (words)")
plt.ylabel("Lexical Diversity (Unique / Total Tokens)")
plt.legend(title='Avg Sentiment', bbox_to_anchor=(1.05, 1), loc=2)
plt.tight_layout()
plt.show()


plt.figure(figsize=(8,6))
sns.scatterplot(data=df_metrics, x='num_sentences', y='num_tokens', 
                hue='lexical_diversity', palette='viridis', s=100)
plt.title("Number of Sentences vs. Total Tokens")
plt.xlabel("Number of Sentences")
plt.ylabel("Total Tokens")
plt.legend(title='Lexical Diversity', bbox_to_anchor=(1.05, 1), loc=2)
plt.tight_layout()
plt.show()
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mmadhusudan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mmadhusudan/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
Per-file Metrics:
file num_tokens num_unique lexical_diversity num_sentences avg_sentence_length avg_sentiment
0 4658.txt.txt 161293 15819 0.098076 8086 26.728172 0.040154
1 37009.txt.txt 77551 4991 0.064358 3295 27.069803 0.119815
2 14609.txt.txt 89507 11561 0.129163 4501 25.416130 0.013161
3 5342.txt.txt 87301 6824 0.078166 6239 17.282898 0.054297
4 17.txt.txt 268340 5539 0.020642 7676 40.448801 0.062850
... ... ... ... ... ... ... ...
2470 55836.txt.txt 57012 7572 0.132814 2200 31.482727 0.023423
2471 1452.txt.txt 132776 10999 0.082839 5061 30.670223 0.032598
2472 10061.txt.txt 3575 1081 0.302378 143 31.643357 -0.012249
2473 8395.txt.txt 18416 2276 0.123588 626 34.236422 0.035615
2474 31011.txt.txt 462 230 0.497835 33 18.515152 0.074618

2475 rows × 7 columns

No description has been provided for this image
No description has been provided for this image
In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os


corpus = []
doc_names = []  

for filename in all_files:
    file_path = os.path.join(folder, filename)
    
    text = get_cleaned_text(file_path)
    corpus.append(text)
    doc_names.append(filename)

print(f"Collected {len(corpus)} documents.")
    

vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)
tfidf_matrix = vectorizer.fit_transform(corpus)
print("TF-IDF matrix shape:", tfidf_matrix.shape)

k = 5  
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(tfidf_matrix)
    

pca = PCA(n_components=2, random_state=42)
tfidf_pca = pca.fit_transform(tfidf_matrix.toarray())

df_plot = pd.DataFrame({
    'PC1': tfidf_pca[:, 0],
    'PC2': tfidf_pca[:, 1],
    'Cluster': clusters,
    'Document': doc_names
})

plt.figure(figsize=(10, 8))
sns.scatterplot(data=df_plot, x='PC1', y='PC2', hue='Cluster', palette='tab10', s=100, legend='full')
plt.title("Document Clustering of Gutenberg Books (PCA Visualization)")
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.legend(title="Cluster", bbox_to_anchor=(1.05, 1), loc=2)
plt.tight_layout()
plt.show()

for i in range(k):
    docs_in_cluster = df_plot[df_plot['Cluster'] == i]['Document'].tolist()
    print(f"Cluster {i} ({len(docs_in_cluster)} documents): {docs_in_cluster}")
Collected 2475 documents.
TF-IDF matrix shape: (2475, 526829)
No description has been provided for this image
Cluster 0 (475 documents): ['5342.txt.txt', '24737.txt.txt', '36020.txt.txt', '14407.txt.txt', '7849.txt.txt', '23393.txt.txt', '24852.txt.txt', '507.txt.txt', '53372.txt.txt', '389.txt.txt', '21736.txt.txt', '6927.txt.txt', '2392.txt.txt', '28650.txt.txt', '215.txt.txt', '982.txt.txt', '173.txt.txt', '15767.txt.txt', '2770.txt.txt', '53386.txt.txt', '24025.txt.txt', '13602.txt.txt', '12116.txt.txt', '8492.txt.txt', '5817.txt.txt', '8565.txt.txt', '28572.txt.txt', '137.txt.txt', '807.txt.txt', '23672.txt.txt', '18937.txt.txt', '4633.txt.txt', '17412.txt.txt', '13646.txt.txt', '4014.txt.txt', '8914.txt.txt', '21316.txt.txt', '2756.txt.txt', '145.txt.txt', '17811.txt.txt', '155.txt.txt', '2833.txt.txt', '6879.txt.txt', '1947.txt.txt', '223.txt.txt', '17396.txt.txt', '17362.txt.txt', '2350.txt.txt', '43.txt.txt', '10607.txt.txt', '108.txt.txt', '316.txt.txt', '1872.txt.txt', '98.txt.txt', '9479.txt.txt', '16251.txt.txt', '11171.txt.txt', '35993.txt.txt', '2735.txt.txt', '24108.txt.txt', '21656.txt.txt', '7841.txt.txt', '2488.txt.txt', '12225.txt.txt', '1897.txt.txt', '13647.txt.txt', '1300.txt.txt', '34165.txt.txt', '19533.txt.txt', '42.txt.txt', '12545.txt.txt', '28693.txt.txt', '25887.txt.txt', '5061.txt.txt', '9903.txt.txt', '1257.txt.txt', '23756.txt.txt', '599.txt.txt', '14833.txt.txt', '520.txt.txt', '24875.txt.txt', '1188.txt.txt', '646.txt.txt', '11592.txt.txt', '21948.txt.txt', '21078.txt.txt', '5343.txt.txt', '16.txt.txt', '4731.txt.txt', '3777.txt.txt', '9925.txt.txt', '2892.txt.txt', '10007.txt.txt', '6133.txt.txt', '7439.txt.txt', '28198.txt.txt', '12573.txt.txt', '398.txt.txt', '564.txt.txt', '24821.txt.txt', '19673.txt.txt', '2609.txt.txt', '1563.txt.txt', '2407.txt.txt', '14893.txt.txt', '204.txt.txt', '58866.txt.txt', '2393.txt.txt', '172.txt.txt', '24389.txt.txt', '2097.txt.txt', '30003.txt.txt', '16718.txt.txt', '2465.txt.txt', '1354.txt.txt', '24858.txt.txt', '10586.txt.txt', '146.txt.txt', '30576.txt.txt', '5348.txt.txt', '4017.txt.txt', '7205.txt.txt', '289.txt.txt', '2441.txt.txt', '15213.txt.txt', '10743.txt.txt', '2852.txt.txt', '31856.txt.txt', '17860.txt.txt', '28617.txt.txt', '58820.txt.txt', '4699.txt.txt', '6984.txt.txt', '4715.txt.txt', '2057.txt.txt', '31963.txt.txt', '644.txt.txt', '24877.txt.txt', '22285.txt.txt', '27190.txt.txt', '9963.txt.txt', '3457.txt.txt', '1472.txt.txt', '24618.txt.txt', '31219.txt.txt', '10554.txt.txt', '14668.txt.txt', '10886.txt.txt', '22014.txt.txt', '13499.txt.txt', '768.txt.txt', '514.txt.txt', '1825.txt.txt', '12297.txt.txt', '20606.txt.txt', '917.txt.txt', '16264.txt.txt', '19389.txt.txt', '5341.txt.txt', '32069.txt.txt', '23922.txt.txt', '559.txt.txt', '421.txt.txt', '19726.txt.txt', '50661.txt.txt', '696.txt.txt', '3536.txt.txt', '5747.txt.txt', '832.txt.txt', '3005.txt.txt', '14060.txt.txt', '23625.txt.txt', '4357.txt.txt', '24350.txt.txt', '2126.txt.txt', '17824.txt.txt', '7875.txt.txt', '2391.txt.txt', '1183.txt.txt', '22925.txt.txt', '53370.txt.txt', '14874.txt.txt', '5322.txt.txt', '974.txt.txt', '10066.txt.txt', '37215.txt.txt', '2070.txt.txt', '36022.txt.txt', '11668.txt.txt', '22544.txt.txt', '5340.txt.txt', '31547.txt.txt', '21687.txt.txt', '14375.txt.txt', '499.txt.txt', '2233.txt.txt', '16663.txt.txt', '17379.txt.txt', '51854.txt.txt', '420.txt.txt', '558.txt.txt', '113.txt.txt', '31861.txt.txt', '5697.txt.txt', '12680.txt.txt', '19755.txt.txt', '8178.txt.txt', '15387.txt.txt', '1182.txt.txt', '583.txt.txt', '9909.txt.txt', '21322.txt.txt', '7144.txt.txt', '3268.txt.txt', '18485.txt.txt', '24644.txt.txt', '7204.txt.txt', '13921.txt.txt', '4792.txt.txt', '157.txt.txt', '770.txt.txt', '61262.txt.txt', '57323.txt.txt', '7423.txt.txt', '22659.txt.txt', '12122.txt.txt', '25783.txt.txt', '24876.txt.txt', '533.txt.txt', '20547.txt.txt', '6985.txt.txt', '794.txt.txt', '3424.txt.txt', '21359.txt.txt', '12630.txt.txt', '37329.txt.txt', '51.txt.txt', '5304.txt.txt', '6753.txt.txt', '17782.txt.txt', '53356.txt.txt', '8164.txt.txt', '24873.txt.txt', '8649.txt.txt', '18253.txt.txt', '1251.txt.txt', '781.txt.txt', '2408.txt.txt', '25449.txt.txt', '6980.txt.txt', '44.txt.txt', '5311.txt.txt', '1155.txt.txt', '54.txt.txt', '2347.txt.txt', '31516.txt.txt', '12108.txt.txt', '9977.txt.txt', '2098.txt.txt', '15281.txt.txt', '24811.txt.txt', '544.txt.txt', '21279.txt.txt', '2834.txt.txt', '707.txt.txt', '7847.txt.txt', '471.txt.txt', '16957.txt.txt', '120.txt.txt', '24887.txt.txt', '21728.txt.txt', '8190.txt.txt', '12267.txt.txt', '2226.txt.txt', '11758.txt.txt', '11620.txt.txt', '1661.txt.txt', '4047.txt.txt', '836.txt.txt', '2028.txt.txt', '1342.txt.txt', '9806.txt.txt', '18505.txt.txt', '308.txt.txt', '1399.txt.txt', '174.txt.txt', '5164.txt.txt', '24022.txt.txt', '18019.txt.txt', '3289.txt.txt', '6382.txt.txt', '23462.txt.txt', '21721.txt.txt', '18833.txt.txt', '14766.txt.txt', '13648.txt.txt', '13213.txt.txt', '1685.txt.txt', '6877.txt.txt', '271.txt.txt', '32664.txt.txt', '3533.txt.txt', '837.txt.txt', '7028.txt.txt', '22761.txt.txt', '1259.txt.txt', '23065.txt.txt', '14075.txt.txt', '446.txt.txt', '12590.txt.txt', '203.txt.txt', '175.txt.txt', '17959.txt.txt', '11255.txt.txt', '1441.txt.txt', '7065.txt.txt', '14028.txt.txt', '73.txt.txt', '6124.txt.txt', '11.txt.txt', '3760.txt.txt', '2557.txt.txt', '20613.txt.txt', '902.txt.txt', '2759.txt.txt', '23569.txt.txt', '9932.txt.txt', '2607.txt.txt', '2042.txt.txt', '22566.txt.txt', '5780.txt.txt', '2183.txt.txt', '3499.txt.txt', '5946.txt.txt', '20796.txt.txt', '1095.txt.txt', '1874.txt.txt', '24968.txt.txt', '7335.txt.txt', '13058.txt.txt', '55.txt.txt', '6112.txt.txt', '45.txt.txt', '2020.txt.txt', '10601.txt.txt', '706.txt.txt', '13650.txt.txt', '2273.txt.txt', '33407.txt.txt', '33619.txt.txt', '21300.txt.txt', '863.txt.txt', '22160.txt.txt', '30667.txt.txt', '6768.txt.txt', '5396.txt.txt', '47338.txt.txt', '2857.txt.txt', '121.txt.txt', '34465.txt.txt', '7471.txt.txt', '19826.txt.txt', '53416.txt.txt', '30486.txt.txt', '996.txt.txt', '22816.txt.txt', '85.txt.txt', '1965.txt.txt', '1184.txt.txt', '95.txt.txt', '3796.txt.txt', '18343.txt.txt', '17161.txt.txt', '24560.txt.txt', '1239.txt.txt', '8486.txt.txt', '14640.txt.txt', '2225.txt.txt', '47696.txt.txt', '19596.txt.txt', '19337.txt.txt', '15839.txt.txt', '910.txt.txt', '21986.txt.txt', '2641.txt.txt', '7896.txt.txt', '34063.txt.txt', '15787.txt.txt', '19033.txt.txt', '13690.txt.txt', '419.txt.txt', '6684.txt.txt', '47.txt.txt', '486.txt.txt', '20366.txt.txt', '1703.txt.txt', '20533.txt.txt', '17314.txt.txt', '16255.txt.txt', '1252.txt.txt', '10914.txt.txt', '30691.txt.txt', '6852.txt.txt', '78.txt.txt', '49131.txt.txt', '38777.txt.txt', '19142.txt.txt', '766.txt.txt', '13937.txt.txt', '41562.txt.txt', '6343.txt.txt', '5141.txt.txt', '4376.txt.txt', '53489.txt.txt', '2752.txt.txt', '20532.txt.txt', '22517.txt.txt', '46.txt.txt', '1298.txt.txt', '20785.txt.txt', '22976.txt.txt', '209.txt.txt', '8176.txt.txt', '6440.txt.txt', '4358.txt.txt', '4230.txt.txt', '2788.txt.txt', '17866.txt.txt', '19706.txt.txt', '51233.txt.txt', '19460.txt.txt', '244.txt.txt', '20348.txt.txt', '1429.txt.txt', '37363.txt.txt', '4532.txt.txt', '6342.txt.txt', '1952.txt.txt', '21699.txt.txt', '12288.txt.txt', '236.txt.txt', '18891.txt.txt', '1480.txt.txt', '24878.txt.txt', '3829.txt.txt', '5200.txt.txt', '584.txt.txt', '25452.txt.txt', '21053.txt.txt', '20757.txt.txt', '31619.txt.txt', '53299.txt.txt', '834.txt.txt', '11544.txt.txt', '24571.txt.txt', '21031.txt.txt', '1036.txt.txt', '1026.txt.txt', '20840.txt.txt', '21286.txt.txt', '25823.txt.txt', '12.txt.txt', '2554.txt.txt', '61168.txt.txt', '502.txt.txt', '22492.txt.txt', '159.txt.txt', '20194.txt.txt', '2781.txt.txt', '560.txt.txt', '5325.txt.txt', '4747.txt.txt', '1980.txt.txt', '60.txt.txt', '2005.txt.txt']
Cluster 1 (509 documents): ['17.txt.txt', '21111.txt.txt', '4693.txt.txt', '2848.txt.txt', '21528.txt.txt', '23475.txt.txt', '29839.txt.txt', '3704.txt.txt', '1995.txt.txt', '4912.txt.txt', '28536.txt.txt', '2760.txt.txt', '11366.txt.txt', '17309.txt.txt', '7242.txt.txt', '11637.txt.txt', '1666.txt.txt', '23169.txt.txt', '20203.txt.txt', '19850.txt.txt', '2147.txt.txt', '831.txt.txt', '12753.txt.txt', '24388.txt.txt', '18674.txt.txt', '17587.txt.txt', '12058.txt.txt', '4300.txt.txt', '1015.txt.txt', '13003.txt.txt', '34844.txt.txt', '13797.txt.txt', '3177.txt.txt', '6104.txt.txt', '1152.txt.txt', '31469.txt.txt', '5891.txt.txt', '5625.txt.txt', '940.txt.txt', '18767.txt.txt', '34294.txt.txt', '10148.txt.txt', '14431.txt.txt', '13711.txt.txt', '16772.txt.txt', '5449.txt.txt', '17310.txt.txt', '2981.txt.txt', '11323.txt.txt', '6052.txt.txt', '1004.txt.txt', '3031.txt.txt', '7499.txt.txt', '14916.txt.txt', '3296.txt.txt', '2850.txt.txt', '711.txt.txt', '12873.txt.txt', '19608.txt.txt', '1484.txt.txt', '1494.txt.txt', '38233.txt.txt', '13796.txt.txt', '46521.txt.txt', '16941.txt.txt', '15263.txt.txt', '30249.txt.txt', '22089.txt.txt', '29056.txt.txt', '37814.txt.txt', '5307.txt.txt', '11112.txt.txt', '13127.txt.txt', '18909.txt.txt', '15675.txt.txt', '13241.txt.txt', '11431.txt.txt', '22675.txt.txt', '2610.txt.txt', '7026.txt.txt', '2600.txt.txt', '2323.txt.txt', '22409.txt.txt', '42622.txt.txt', '6585.txt.txt', '23499.txt.txt', '21381.txt.txt', '353.txt.txt', '1837.txt.txt', '21899.txt.txt', '11348.txt.txt', '37206.txt.txt', '2882.txt.txt', '7999.txt.txt', '18857.txt.txt', '1597.txt.txt', '9886.txt.txt', '10150.txt.txt', '14314.txt.txt', '5529.txt.txt', '18193.txt.txt', '6782.txt.txt', '441.txt.txt', '529.txt.txt', '4294.txt.txt', '5266.txt.txt', '18910.txt.txt', '14366.txt.txt', '11438.txt.txt', '830.txt.txt', '23533.txt.txt', '18297.txt.txt', '3534.txt.txt', '37787.txt.txt', '348.txt.txt', '22693.txt.txt', '10985.txt.txt', '22117.txt.txt', '3327.txt.txt', '1312.txt.txt', '2276.txt.txt', '7440.txt.txt', '1653.txt.txt', '6615.txt.txt', '22456.txt.txt', '53368.txt.txt', '124.txt.txt', '18362.txt.txt', '3284.txt.txt', '26196.txt.txt', '6818.txt.txt', '12384.txt.txt', '4028.txt.txt', '20748.txt.txt', '25016.txt.txt', '5998.txt.txt', '15040.txt.txt', '169.txt.txt', '18247.txt.txt', '654.txt.txt', '20955.txt.txt', '5921.txt.txt', '2388.txt.txt', '14591.txt.txt', '34607.txt.txt', '1039.txt.txt', '3721.txt.txt', '5305.txt.txt', '12561.txt.txt', '13113.txt.txt', '6764.txt.txt', '16160.txt.txt', '14466.txt.txt', '30230.txt.txt', '54337.txt.txt', '7838.txt.txt', '23875.txt.txt', '341.txt.txt', '3810.txt.txt', '8220.txt.txt', '14241.txt.txt', '8299.txt.txt', '2874.txt.txt', '16672.txt.txt', '12426.txt.txt', '8795.txt.txt', '33434.txt.txt', '5160.txt.txt', '13291.txt.txt', '36844.txt.txt', '82.txt.txt', '443.txt.txt', '12595.txt.txt', '22396.txt.txt', '29349.txt.txt', '7951.txt.txt', '36970.txt.txt', '601.txt.txt', '37959.txt.txt', '18721.txt.txt', '16939.txt.txt', '6130.txt.txt', '22373.txt.txt', '13102.txt.txt', '25325.txt.txt', '11772.txt.txt', '10897.txt.txt', '41085.txt.txt', '16452.txt.txt', '964.txt.txt', '30221.txt.txt', '27887.txt.txt', '11241.txt.txt', '13559.txt.txt', '2881.txt.txt', '14328.txt.txt', '22886.txt.txt', '30760.txt.txt', '15.txt.txt', '228.txt.txt', '16697.txt.txt', '42324.txt.txt', '28497.txt.txt', '2145.txt.txt', '6886.txt.txt', '19529.txt.txt', '22083.txt.txt', '18500.txt.txt', '19068.txt.txt', '7864.txt.txt', '6791.txt.txt', '207.txt.txt', '37904.txt.txt', '39674.txt.txt', '14880.txt.txt', '7477.txt.txt', '3623.txt.txt', '17766.txt.txt', '5157.txt.txt', '2744.txt.txt', '12473.txt.txt', '3050.txt.txt', '19240.txt.txt', '25918.txt.txt', '10845.txt.txt', '135.txt.txt', '31847.txt.txt', '2853.txt.txt', '26046.txt.txt', '7889.txt.txt', '11565.txt.txt', '36131.txt.txt', '15659.txt.txt', '15164.txt.txt', '1371.txt.txt', '11330.txt.txt', '19061.txt.txt', '3837.txt.txt', '6788.txt.txt', '13097.txt.txt', '7353.txt.txt', '30103.txt.txt', '23.txt.txt', '3434.txt.txt', '20885.txt.txt', '3261.txt.txt', '7025.txt.txt', '12758.txt.txt', '11277.txt.txt', '22500.txt.txt', '25546.txt.txt', '17208.txt.txt', '23580.txt.txt', '21189.txt.txt', '19312.txt.txt', '7099.txt.txt', '10662.txt.txt', '36.txt.txt', '3725.txt.txt', '37122.txt.txt', '12543.txt.txt', '11741.txt.txt', '3160.txt.txt', '20656.txt.txt', '2149.txt.txt', '12060.txt.txt', '8743.txt.txt', '6687.txt.txt', '22420.txt.txt', '34736.txt.txt', '2680.txt.txt', '15697.txt.txt', '12380.txt.txt', '14297.txt.txt', '2846.txt.txt', '11408.txt.txt', '13268.txt.txt', '31162.txt.txt', '36124.txt.txt', '2166.txt.txt', '1002.txt.txt', '14020.txt.txt', '22217.txt.txt', '11894.txt.txt', '12422.txt.txt', '7245.txt.txt', '17162.txt.txt', '14360.txt.txt', '38827.txt.txt', '2150.txt.txt', '12888.txt.txt', '12898.txt.txt', '7766.txt.txt', '22096.txt.txt', '45619.txt.txt', '22382.txt.txt', '24902.txt.txt', '86.txt.txt', '33687.txt.txt', '12982.txt.txt', '21614.txt.txt', '6168.txt.txt', '139.txt.txt', '15794.txt.txt', '7885.txt.txt', '19900.txt.txt', '16985.txt.txt', '62.txt.txt', '19287.txt.txt', '8800.txt.txt', '6702.txt.txt', '3761.txt.txt', '23245.txt.txt', '903.txt.txt', '345.txt.txt', '11870.txt.txt', '1831.txt.txt', '21692.txt.txt', '4389.txt.txt', '29246.txt.txt', '9380.txt.txt', '7401.txt.txt', '35554.txt.txt', '2151.txt.txt', '7777.txt.txt', '4330.txt.txt', '17163.txt.txt', '21138.txt.txt', '2400.txt.txt', '24676.txt.txt', '456.txt.txt', '213.txt.txt', '17034.txt.txt', '14226.txt.txt', '808.txt.txt', '17024.txt.txt', '1727.txt.txt', '8801.txt.txt', '12259.txt.txt', '16984.txt.txt', '11123.txt.txt', '18274.txt.txt', '33965.txt.txt', '677.txt.txt', '6087.txt.txt', '10376.txt.txt', '45858.txt.txt', '13202.txt.txt', '11462.txt.txt', '2064.txt.txt', '25718.txt.txt', '14411.txt.txt', '10026.txt.txt', '11369.txt.txt', '1250.txt.txt', '16131.txt.txt', '8789.txt.txt', '8799.txt.txt', '18134.txt.txt', '12797.txt.txt', '4380.txt.txt', '41445.txt.txt', '11740.txt.txt', '2503.txt.txt', '23997.txt.txt', '14445.txt.txt', '2148.txt.txt', '4339.txt.txt', '1467.txt.txt', '14368.txt.txt', '1493.txt.txt', '5225.txt.txt', '13015.txt.txt', '2750.txt.txt', '873.txt.txt', '26346.txt.txt', '35345.txt.txt', '31111.txt.txt', '25763.txt.txt', '1365.txt.txt', '30511.txt.txt', '16537.txt.txt', '5402.txt.txt', '23031.txt.txt', '1003.txt.txt', '10430.txt.txt', '19226.txt.txt', '131.txt.txt', '16042.txt.txt', '35534.txt.txt', '2131.txt.txt', '21324.txt.txt', '12928.txt.txt', '39341.txt.txt', '23495.txt.txt', '22381.txt.txt', '24869.txt.txt', '10380.txt.txt', '4974.txt.txt', '19721.txt.txt', '17851.txt.txt', '20851.txt.txt', '6317.txt.txt', '11310.txt.txt', '11887.txt.txt', '3657.txt.txt', '13053.txt.txt', '23650.txt.txt', '11863.txt.txt', '503.txt.txt', '22483.txt.txt', '148.txt.txt', '10002.txt.txt', '60908.txt.txt', '12351.txt.txt', '17567.txt.txt', '12333.txt.txt', '16986.txt.txt', '1008.txt.txt', '2527.txt.txt', '3155.txt.txt', '972.txt.txt', '10070.txt.txt', '25600.txt.txt', '7947.txt.txt', '18214.txt.txt', '10838.txt.txt', '8578.txt.txt', '7190.txt.txt', '944.txt.txt', '13579.txt.txt', '25428.txt.txt', '3726.txt.txt', '15343.txt.txt', '3163.txt.txt', '2563.txt.txt', '35.txt.txt', '10671.txt.txt', '792.txt.txt', '3422.txt.txt', '12169.txt.txt', '1001.txt.txt', '1998.txt.txt', '12242.txt.txt', '17221.txt.txt', '6574.txt.txt', '38767.txt.txt', '12814.txt.txt', '22694.txt.txt', '17770.txt.txt', '14572.txt.txt', '31103.txt.txt', '699.txt.txt', '32518.txt.txt', '689.txt.txt', '6422.txt.txt', '7128.txt.txt', '16140.txt.txt', '15399.txt.txt', '37872.txt.txt', '28019.txt.txt', '21254.txt.txt', '13944.txt.txt', '32934.txt.txt', '36405.txt.txt', '2743.txt.txt', '32300.txt.txt', '22101.txt.txt', '26500.txt.txt', '7446.txt.txt', '13643.txt.txt', '18828.txt.txt', '8159.txt.txt', '10842.txt.txt', '23494.txt.txt', '2199.txt.txt', '2800.txt.txt', '6932.txt.txt', '8642.txt.txt', '84.txt.txt', '21973.txt.txt', '13665.txt.txt', '23545.txt.txt', '3317.txt.txt', '20102.txt.txt', '15145.txt.txt', '680.txt.txt', '13334.txt.txt', '7386.txt.txt', '41537.txt.txt', '1906.txt.txt', '44896.txt.txt', '10806.txt.txt', '20778.txt.txt', '4018.txt.txt', '16751.txt.txt', '512.txt.txt', '25281.txt.txt', '27933.txt.txt', '15950.txt.txt', '9882.txt.txt', '20907.txt.txt', '45542.txt.txt', '15932.txt.txt', '11387.txt.txt', '36564.txt.txt', '11246.txt.txt', '14460.txt.txt', '24263.txt.txt']
Cluster 2 (823 documents): ['4658.txt.txt', '37009.txt.txt', '25023.txt.txt', '22065.txt.txt', '15620.txt.txt', '22748.txt.txt', '22620.txt.txt', '34781.txt.txt', '20191.txt.txt', '8700.txt.txt', '18751.txt.txt', '14814.txt.txt', '38516.txt.txt', '26867.txt.txt', '19514.txt.txt', '37823.txt.txt', '3462.txt.txt', '34554.txt.txt', '18846.txt.txt', '17354.txt.txt', '33502.txt.txt', '30155.txt.txt', '38398.txt.txt', '12327.txt.txt', '19274.txt.txt', '61142.txt.txt', '34093.txt.txt', '33629.txt.txt', '24506.txt.txt', '5173.txt.txt', '35448.txt.txt', '25731.txt.txt', '37350.txt.txt', '17742.txt.txt', '23626.txt.txt', '35013.txt.txt', '6630.txt.txt', '28957.txt.txt', '35542.txt.txt', '29816.txt.txt', '15091.txt.txt', '17855.txt.txt', '20587.txt.txt', '277.txt.txt', '35829.txt.txt', '30293.txt.txt', '13493.txt.txt', '2939.txt.txt', '20153.txt.txt', '2009.txt.txt', '21531.txt.txt', '31175.txt.txt', '27638.txt.txt', '14600.txt.txt', '42198.txt.txt', '2746.txt.txt', '10986.txt.txt', '24448.txt.txt', '33767.txt.txt', '5710.txt.txt', '20774.txt.txt', '16543.txt.txt', '22114.txt.txt', '18556.txt.txt', '24923.txt.txt', '20386.txt.txt', '18884.txt.txt', '13923.txt.txt', '24787.txt.txt', '26113.txt.txt', '23991.txt.txt', '26558.txt.txt', '19921.txt.txt', '34604.txt.txt', '20924.txt.txt', '46773.txt.txt', '19998.txt.txt', '35830.txt.txt', '35842.txt.txt', '19550.txt.txt', '8215.txt.txt', '39471.txt.txt', '38003.txt.txt', '27868.txt.txt', '34903.txt.txt', '38013.txt.txt', '31050.txt.txt', '27675.txt.txt', '19953.txt.txt', '24222.txt.txt', '11734.txt.txt', '30754.txt.txt', '32376.txt.txt', '26323.txt.txt', '12774.txt.txt', '33029.txt.txt', '10751.txt.txt', '3674.txt.txt', '22379.txt.txt', '23673.txt.txt', '43375.txt.txt', '38404.txt.txt', '10834.txt.txt', '2938.txt.txt', '18525.txt.txt', '39372.txt.txt', '9457.txt.txt', '19406.txt.txt', '11662.txt.txt', '26672.txt.txt', '27378.txt.txt', '58008.txt.txt', '1887.txt.txt', '16119.txt.txt', '30321.txt.txt', '1268.txt.txt', '34175.txt.txt', '48007.txt.txt', '34984.txt.txt', '33766.txt.txt', '24449.txt.txt', '11498.txt.txt', '28434.txt.txt', '18237.txt.txt', '12956.txt.txt', '8423.txt.txt', '35744.txt.txt', '34326.txt.txt', '38482.txt.txt', '14990.txt.txt', '27560.txt.txt', '39396.txt.txt', '15665.txt.txt', '15020.txt.txt', '14558.txt.txt', '35450.txt.txt', '34501.txt.txt', '34479.txt.txt', '6986.txt.txt', '375.txt.txt', '8172.txt.txt', '15193.txt.txt', '27911.txt.txt', '51547.txt.txt', '39235.txt.txt', '9666.txt.txt', '33972.txt.txt', '9650.txt.txt', '19769.txt.txt', '23770.txt.txt', '7010.txt.txt', '27748.txt.txt', '16441.txt.txt', '34076.txt.txt', '14474.txt.txt', '15535.txt.txt', '13111.txt.txt', '33287.txt.txt', '28764.txt.txt', '28402.txt.txt', '26393.txt.txt', '53373.txt.txt', '28247.txt.txt', '29635.txt.txt', '22766.txt.txt', '2124.txt.txt', '20116.txt.txt', '12648.txt.txt', '18183.txt.txt', '1615.txt.txt', '19499.txt.txt', '3620.txt.txt', '29233.txt.txt', '32974.txt.txt', '38440.txt.txt', '19420.txt.txt', '19275.txt.txt', '19103.txt.txt', '18334.txt.txt', '20788.txt.txt', '41533.txt.txt', '28897.txt.txt', '42649.txt.txt', '12261.txt.txt', '15237.txt.txt', '43282.txt.txt', '35070.txt.txt', '18900.txt.txt', '37566.txt.txt', '32505.txt.txt', '13575.txt.txt', '34131.txt.txt', '38959.txt.txt', '24258.txt.txt', '19251.txt.txt', '34585.txt.txt', '32950.txt.txt', '29019.txt.txt', '22107.txt.txt', '17132.txt.txt', '4524.txt.txt', '6344.txt.txt', '31751.txt.txt', '19380.txt.txt', '34211.txt.txt', '14056.txt.txt', '60281.txt.txt', '20411.txt.txt', '33508.txt.txt', '24637.txt.txt', '35601.txt.txt', '18790.txt.txt', '4748.txt.txt', '24072.txt.txt', '5761.txt.txt', '31987.txt.txt', '19145.txt.txt', '14293.txt.txt', '16921.txt.txt', '22.txt.txt', '31570.txt.txt', '40818.txt.txt', '20556.txt.txt', '27676.txt.txt', '11204.txt.txt', '17748.txt.txt', '39472.txt.txt', '8018.txt.txt', '14959.txt.txt', '32817.txt.txt', '11684.txt.txt', '8102.txt.txt', '19449.txt.txt', '30850.txt.txt', '19364.txt.txt', '3788.txt.txt', '8559.txt.txt', '53347.txt.txt', '20927.txt.txt', '16220.txt.txt', '31674.txt.txt', '19786.txt.txt', '31149.txt.txt', '49119.txt.txt', '22344.txt.txt', '33527.txt.txt', '17209.txt.txt', '28681.txt.txt', '38480.txt.txt', '13618.txt.txt', '32982.txt.txt', '31534.txt.txt', '20159.txt.txt', '17149.txt.txt', '22790.txt.txt', '34954.txt.txt', '48836.txt.txt', '13489.txt.txt', '18203.txt.txt', '37842.txt.txt', '39996.txt.txt', '36903.txt.txt', '17192.txt.txt', '24583.txt.txt', '37529.txt.txt', '26477.txt.txt', '31630.txt.txt', '17606.txt.txt', '17408.txt.txt', '14.txt.txt', '35157.txt.txt', '8866.txt.txt', '4973.txt.txt', '9411.txt.txt', '274.txt.txt', '24964.txt.txt', '35490.txt.txt', '20846.txt.txt', '33648.txt.txt', '17987.txt.txt', '38687.txt.txt', '18285.txt.txt', '35062.txt.txt', '34568.txt.txt', '14012.txt.txt', '15460.txt.txt', '19591.txt.txt', '17455.txt.txt', '2232.txt.txt', '30001.txt.txt', '14625.txt.txt', '26656.txt.txt', '14070.txt.txt', '30626.txt.txt', '7223.txt.txt', '27509.txt.txt', '29086.txt.txt', '30550.txt.txt', '11365.txt.txt', '21566.txt.txt', '22764.txt.txt', '7700.txt.txt', '38207.txt.txt', '6934.txt.txt', '18050.txt.txt', '8001.txt.txt', '216.txt.txt', '14838.txt.txt', '25973.txt.txt', '17910.txt.txt', '28569.txt.txt', '779.txt.txt', '19506.txt.txt', '1176.txt.txt', '37101.txt.txt', '20346.txt.txt', '6.txt.txt', '21724.txt.txt', '20665.txt.txt', '13274.txt.txt', '22728.txt.txt', '33129.txt.txt', '30243.txt.txt', '31621.txt.txt', '15622.txt.txt', '13008.txt.txt', '38290.txt.txt', '2306.txt.txt', '34371.txt.txt', '17571.txt.txt', '36285.txt.txt', '35097.txt.txt', '32958.txt.txt', '19564.txt.txt', '25286.txt.txt', '7787.txt.txt', '46981.txt.txt', '31141.txt.txt', '33721.txt.txt', '12629.txt.txt', '33659.txt.txt', '15707.txt.txt', '30523.txt.txt', '48.txt.txt', '30000.txt.txt', '34579.txt.txt', '8952.txt.txt', '27386.txt.txt', '548.txt.txt', '4962.txt.txt', '21916.txt.txt', '19737.txt.txt', '103.txt.txt', '26598.txt.txt', '22397.txt.txt', '19423.txt.txt', '83.txt.txt', '11647.txt.txt', '18971.txt.txt', '20426.txt.txt', '17740.txt.txt', '53384.txt.txt', '9090.txt.txt', '20750.txt.txt', '4346.txt.txt', '30541.txt.txt', '2888.txt.txt', '23259.txt.txt', '6478.txt.txt', '20776.txt.txt', '25140.txt.txt', '15435.txt.txt', '22260.txt.txt', '15270.txt.txt', '51058.txt.txt', '24931.txt.txt', '19250.txt.txt', '23186.txt.txt', '18544.txt.txt', '1487.txt.txt', '54460.txt.txt', '28570.txt.txt', '34834.txt.txt', '4759.txt.txt', '24063.txt.txt', '1642.txt.txt', '34437.txt.txt', '16523.txt.txt', '31732.txt.txt', '8452.txt.txt', '34098.txt.txt', '168.txt.txt', '36924.txt.txt', '19634.txt.txt', '10912.txt.txt', '23755.txt.txt', '31334.txt.txt', '40819.txt.txt', '24156.txt.txt', '15139.txt.txt', '5000.txt.txt', '12017.txt.txt', '41958.txt.txt', '55084.txt.txt', '23433.txt.txt', '24407.txt.txt', '31513.txt.txt', '41.txt.txt', '14993.txt.txt', '15284.txt.txt', '38315.txt.txt', '38428.txt.txt', '23691.txt.txt', '7297.txt.txt', '50133.txt.txt', '19885.txt.txt', '20839.txt.txt', '17829.txt.txt', '3421.txt.txt', '26378.txt.txt', '20317.txt.txt', '16130.txt.txt', '31574.txt.txt', '3008.txt.txt', '27600.txt.txt', '27778.txt.txt', '31035.txt.txt', '18298.txt.txt', '32472.txt.txt', '13402.txt.txt', '1233.txt.txt', '38956.txt.txt', '5067.txt.txt', '23585.txt.txt', '9612.txt.txt', '53343.txt.txt', '27975.txt.txt', '279.txt.txt', '62579.txt.txt', '8106.txt.txt', '1881.txt.txt', '20763.txt.txt', '31830.txt.txt', '10840.txt.txt', '24790.txt.txt', '15489.txt.txt', '519.txt.txt', '18637.txt.txt', '21781.txt.txt', '14868.txt.txt', '19029.txt.txt', '34949.txt.txt', '5765.txt.txt', '22009.txt.txt', '22990.txt.txt', '34101.txt.txt', '19090.txt.txt', '6322.txt.txt', '11335.txt.txt', '32653.txt.txt', '15207.txt.txt', '3672.txt.txt', '27559.txt.txt', '30666.txt.txt', '20467.txt.txt', '35128.txt.txt', '17451.txt.txt', '15464.txt.txt', '25267.txt.txt', '14006.txt.txt', '14370.txt.txt', '14218.txt.txt', '15147.txt.txt', '28553.txt.txt', '19722.txt.txt', '18350.txt.txt', '19444.txt.txt', '53669.txt.txt', '22288.txt.txt', '32962.txt.txt', '18779.txt.txt', '37901.txt.txt', '19115.txt.txt', '34094.txt.txt', '164.txt.txt', '38658.txt.txt', '53381.txt.txt', '25992.txt.txt', '27713.txt.txt', '37512.txt.txt', '31240.txt.txt', '27348.txt.txt', '11385.txt.txt', '18452.txt.txt', '5180.txt.txt', '22784.txt.txt', '180.txt.txt', '56482.txt.txt', '31458.txt.txt', '59417.txt.txt', '13117.txt.txt', '18929.txt.txt', '34449.txt.txt', '24730.txt.txt', '30775.txt.txt', '36504.txt.txt', '14776.txt.txt', '16116.txt.txt', '14400.txt.txt', '33044.txt.txt', '31624.txt.txt', '32426.txt.txt', '2884.txt.txt', '48991.txt.txt', '4204.txt.txt', '10011.txt.txt', '45532.txt.txt', '38032.txt.txt', '28233.txt.txt', '19042.txt.txt', '39205.txt.txt', '31951.txt.txt', '28466.txt.txt', '26908.txt.txt', '29362.txt.txt', '28710.txt.txt', '8142.txt.txt', '37856.txt.txt', '12293.txt.txt', '24855.txt.txt', '16295.txt.txt', '15097.txt.txt', '2871.txt.txt', '19723.txt.txt', '38308.txt.txt', '19368.txt.txt', '7254.txt.txt', '32159.txt.txt', '5424.txt.txt', '19856.txt.txt', '48769.txt.txt', '4248.txt.txt', '25983.txt.txt', '22771.txt.txt', '30310.txt.txt', '30066.txt.txt', '25529.txt.txt', '18184.txt.txt', '19262.txt.txt', '87.txt.txt', '994.txt.txt', '18333.txt.txt', '14473.txt.txt', '2016.txt.txt', '35413.txt.txt', '34634.txt.txt', '5605.txt.txt', '33504.txt.txt', '18928.txt.txt', '2.txt.txt', '59416.txt.txt', '14091.txt.txt', '18735.txt.txt', '19031.txt.txt', '53364.txt.txt', '18206.txt.txt', '37595.txt.txt', '34787.txt.txt', '19606.txt.txt', '17966.txt.txt', '19053.txt.txt', '29728.txt.txt', '15491.txt.txt', '59298.txt.txt', '17575.txt.txt', '33566.txt.txt', '36036.txt.txt', '22636.txt.txt', '25646.txt.txt', '22600.txt.txt', '36645.txt.txt', '9914.txt.txt', '27238.txt.txt', '34353.txt.txt', '37151.txt.txt', '12787.txt.txt', '35596.txt.txt', '38077.txt.txt', '31293.txt.txt', '19209.txt.txt', '14987.txt.txt', '37776.txt.txt', '18866.txt.txt', '34259.txt.txt', '2030.txt.txt', '25874.txt.txt', '30181.txt.txt', '13791.txt.txt', '20390.txt.txt', '49080.txt.txt', '4657.txt.txt', '13640.txt.txt', '3332.txt.txt', '34737.txt.txt', '34110.txt.txt', '17700.txt.txt', '38189.txt.txt', '30677.txt.txt', '18931.txt.txt', '27558.txt.txt', '24077.txt.txt', '14664.txt.txt', '16780.txt.txt', '24485.txt.txt', '19715.txt.txt', '12443.txt.txt', '26014.txt.txt', '7234.txt.txt', '17275.txt.txt', '5201.txt.txt', '20113.txt.txt', '25735.txt.txt', '1323.txt.txt', '13347.txt.txt', '19116.txt.txt', '33830.txt.txt', '10084.txt.txt', '35937.txt.txt', '11924.txt.txt', '19270.txt.txt', '16972.txt.txt', '29122.txt.txt', '8547.txt.txt', '33852.txt.txt', '13325.txt.txt', '31147.txt.txt', '27137.txt.txt', '20019.txt.txt', '1027.txt.txt', '7413.txt.txt', '14015.txt.txt', '26076.txt.txt', '1662.txt.txt', '21020.txt.txt', '31221.txt.txt', '7256.txt.txt', '3807.txt.txt', '45647.txt.txt', '20195.txt.txt', '33967.txt.txt', '22829.txt.txt', '14403.txt.txt', '2066.txt.txt', '19180.txt.txt', '20769.txt.txt', '53480.txt.txt', '23403.txt.txt', '33574.txt.txt', '34532.txt.txt', '49032.txt.txt', '3772.txt.txt', '1112.txt.txt', '61.txt.txt', '34550.txt.txt', '18852.txt.txt', '37742.txt.txt', '20663.txt.txt', '13272.txt.txt', '19913.txt.txt', '15003.txt.txt', '33915.txt.txt', '39421.txt.txt', '571.txt.txt', '16847.txt.txt', '14872.txt.txt', '38418.txt.txt', '22282.txt.txt', '20848.txt.txt', '48807.txt.txt', '25063.txt.txt', '22035.txt.txt', '25550.txt.txt', '27517.txt.txt', '25.txt.txt', '3754.txt.txt', '34672.txt.txt', '926.txt.txt', '16410.txt.txt', '33941.txt.txt', '28216.txt.txt', '36922.txt.txt', '29031.txt.txt', '8177.txt.txt', '14826.txt.txt', '32189.txt.txt', '28018.txt.txt', '19279.txt.txt', '17474.txt.txt', '53929.txt.txt', '11615.txt.txt', '23666.txt.txt', '16360.txt.txt', '48070.txt.txt', '16370.txt.txt', '18458.txt.txt', '8419.txt.txt', '33874.txt.txt', '18013.txt.txt', '46303.txt.txt', '17382.txt.txt', '32947.txt.txt', '34582.txt.txt', '39988.txt.txt', '12299.txt.txt', '21688.txt.txt', '10843.txt.txt', '11344.txt.txt', '31756.txt.txt', '10726.txt.txt', '15100.txt.txt', '2742.txt.txt', '60088.txt.txt', '53499.txt.txt', '17135.txt.txt', '13007.txt.txt', '31558.txt.txt', '40820.txt.txt', '34044.txt.txt', '24776.txt.txt', '15352.txt.txt', '21081.txt.txt', '21918.txt.txt', '18155.txt.txt', '16081.txt.txt', '27977.txt.txt', '19729.txt.txt', '18223.txt.txt', '22914.txt.txt', '19651.txt.txt', '18251.txt.txt', '14837.txt.txt', '8997.txt.txt', '19494.txt.txt', '33543.txt.txt', '11649.txt.txt', '12655.txt.txt', '23434.txt.txt', '26457.txt.txt', '23319.txt.txt', '12815.txt.txt', '17289.txt.txt', '20299.txt.txt', '21007.txt.txt', '37632.txt.txt', '12406.txt.txt', '3708.txt.txt', '21534.txt.txt', '29691.txt.txt', '1533.txt.txt', '2753.txt.txt', '20771.txt.txt', '17124.txt.txt', '30560.txt.txt', '36669.txt.txt', '15888.txt.txt', '15069.txt.txt', '35158.txt.txt', '48530.txt.txt', '4011.txt.txt', '29444.txt.txt', '16955.txt.txt', '10852.txt.txt', '15831.txt.txt', '8502.txt.txt', '1974.txt.txt', '19271.txt.txt', '4907.txt.txt', '19424.txt.txt', '40817.txt.txt', '23066.txt.txt', '35006.txt.txt', '25990.txt.txt', '3307.txt.txt', '15137.txt.txt', '9097.txt.txt', '1228.txt.txt', '10773.txt.txt', '17170.txt.txt', '10136.txt.txt', '24409.txt.txt', '32032.txt.txt', '3003.txt.txt', '4778.txt.txt', '32677.txt.txt', '13962.txt.txt', '37809.txt.txt', '2872.txt.txt', '7188.txt.txt', '36830.txt.txt', '7014.txt.txt', '12238.txt.txt', '34376.txt.txt', '12350.txt.txt', '34523.txt.txt', '13177.txt.txt', '6875.txt.txt', '38356.txt.txt', '2485.txt.txt', '14969.txt.txt', '19138.txt.txt', '34848.txt.txt', '33966.txt.txt', '33914.txt.txt', '5192.txt.txt', '19354.txt.txt', '20239.txt.txt', '6329.txt.txt', '9943.txt.txt', '22657.txt.txt', '29688.txt.txt', '16378.txt.txt', '8395.txt.txt', '31011.txt.txt']
Cluster 3 (147 documents): ['14609.txt.txt', '54020.txt.txt', '320.txt.txt', '18723.txt.txt', '2000.txt.txt', '26608.txt.txt', '5258.txt.txt', '60656.txt.txt', '43389.txt.txt', '17203.txt.txt', '23654.txt.txt', '42648.txt.txt', '13951.txt.txt', '18783.txt.txt', '37989.txt.txt', '32315.txt.txt', '42131.txt.txt', '56327.txt.txt', '62406.txt.txt', '49168.txt.txt', '17707.txt.txt', '5126.txt.txt', '41322.txt.txt', '4791.txt.txt', '42765.txt.txt', '32298.txt.txt', '4968.txt.txt', '28827.txt.txt', '4717.txt.txt', '4649.txt.txt', '46111.txt.txt', '21282.txt.txt', '17798.txt.txt', '5097.txt.txt', '2820.txt.txt', '40827.txt.txt', '20401.txt.txt', '37951.txt.txt', '43761.txt.txt', '45468.txt.txt', '62405.txt.txt', '44958.txt.txt', '57687.txt.txt', '35103.txt.txt', '39201.txt.txt', '15066.txt.txt', '22268.txt.txt', '10775.txt.txt', '62196.txt.txt', '24515.txt.txt', '27278.txt.txt', '18798.txt.txt', '13216.txt.txt', '17419.txt.txt', '53749.txt.txt', '34783.txt.txt', '60882.txt.txt', '17691.txt.txt', '54873.txt.txt', '20394.txt.txt', '27752.txt.txt', '13703.txt.txt', '62404.txt.txt', '35498.txt.txt', '42525.txt.txt', '57648.txt.txt', '19496.txt.txt', '36315.txt.txt', '17013.txt.txt', '19643.txt.txt', '18157.txt.txt', '15113.txt.txt', '59037.txt.txt', '24924.txt.txt', '14287.txt.txt', '27757.txt.txt', '18784.txt.txt', '14158.txt.txt', '11748.txt.txt', '49437.txt.txt', '10814.txt.txt', '20852.txt.txt', '26699.txt.txt', '59859.txt.txt', '39101.txt.txt', '60198.txt.txt', '16826.txt.txt', '39331.txt.txt', '11302.txt.txt', '26370.txt.txt', '799.txt.txt', '16827.txt.txt', '31881.txt.txt', '16059.txt.txt', '2419.txt.txt', '39328.txt.txt', '25317.txt.txt', '18289.txt.txt', '10841.txt.txt', '24536.txt.txt', '18921.txt.txt', '13622.txt.txt', '58706.txt.txt', '19106.txt.txt', '44300.txt.txt', '58801.txt.txt', '62615.txt.txt', '13846.txt.txt', '52123.txt.txt', '38674.txt.txt', '55855.txt.txt', '14765.txt.txt', '16105.txt.txt', '4548.txt.txt', '53523.txt.txt', '35802.txt.txt', '15353.txt.txt', '49887.txt.txt', '18864.txt.txt', '1619.txt.txt', '19234.txt.txt', '26818.txt.txt', '51338.txt.txt', '34008.txt.txt', '12230.txt.txt', '16885.txt.txt', '24766.txt.txt', '14799.txt.txt', '12533.txt.txt', '17073.txt.txt', '35444.txt.txt', '55554.txt.txt', '45590.txt.txt', '28281.txt.txt', '13525.txt.txt', '13792.txt.txt', '52894.txt.txt', '15127.txt.txt', '25756.txt.txt', '43901.txt.txt', '49836.txt.txt', '57547.txt.txt', '36708.txt.txt', '48529.txt.txt', '53540.txt.txt', '55836.txt.txt', '10061.txt.txt']
Cluster 4 (521 documents): ['13172.txt.txt', '23609.txt.txt', '2439.txt.txt', '17326.txt.txt', '7001.txt.txt', '11464.txt.txt', '15699.txt.txt', '15955.txt.txt', '16831.txt.txt', '14866.txt.txt', '6812.txt.txt', '53527.txt.txt', '16528.txt.txt', '8390.txt.txt', '3029.txt.txt', '2376.txt.txt', '1961.txt.txt', '16399.txt.txt', '23428.txt.txt', '4511.txt.txt', '24708.txt.txt', '14988.txt.txt', '17213.txt.txt', '13612.txt.txt', '2157.txt.txt', '13249.txt.txt', '1468.txt.txt', '754.txt.txt', '2096.txt.txt', '26490.txt.txt', '17845.txt.txt', '11952.txt.txt', '13941.txt.txt', '24951.txt.txt', '16598.txt.txt', '16531.txt.txt', '4278.txt.txt', '2442.txt.txt', '19289.txt.txt', '14672.txt.txt', '10444.txt.txt', '15262.txt.txt', '13656.txt.txt', '4362.txt.txt', '17774.txt.txt', '17188.txt.txt', '14975.txt.txt', '31511.txt.txt', '20014.txt.txt', '4386.txt.txt', '36496.txt.txt', '449.txt.txt', '28659.txt.txt', '17897.txt.txt', '14264.txt.txt', '20897.txt.txt', '22606.txt.txt', '4716.txt.txt', '10114.txt.txt', '25049.txt.txt', '13635.txt.txt', '27262.txt.txt', '17461.txt.txt', '26600.txt.txt', '2443.txt.txt', '14291.txt.txt', '13482.txt.txt', '2832.txt.txt', '28677.txt.txt', '19243.txt.txt', '477.txt.txt', '3043.txt.txt', '45846.txt.txt', '30047.txt.txt', '12235.txt.txt', '31425.txt.txt', '7452.txt.txt', '14577.txt.txt', '4363.txt.txt', '23593.txt.txt', '624.txt.txt', '11102.txt.txt', '16467.txt.txt', '22607.txt.txt', '24700.txt.txt', '30755.txt.txt', '26981.txt.txt', '15561.txt.txt', '6167.txt.txt', '99.txt.txt', '448.txt.txt', '60736.txt.txt', '12902.txt.txt', '18273.txt.txt', '19322.txt.txt', '24726.txt.txt', '17327.txt.txt', '22557.txt.txt', '14760.txt.txt', '1271.txt.txt', '22631.txt.txt', '6462.txt.txt', '22885.txt.txt', '10065.txt.txt', '20666.txt.txt', '20023.txt.txt', '5.txt.txt', '5321.txt.txt', '16983.txt.txt', '13888.txt.txt', '13831.txt.txt', '45303.txt.txt', '28257.txt.txt', '2849.txt.txt', '16331.txt.txt', '22776.txt.txt', '17753.txt.txt', '852.txt.txt', '15776.txt.txt', '26377.txt.txt', '15255.txt.txt', '47730.txt.txt', '15803.txt.txt', '983.txt.txt', '5694.txt.txt', '11953.txt.txt', '19217.txt.txt', '28066.txt.txt', '13613.txt.txt', '17212.txt.txt', '13056.txt.txt', '31234.txt.txt', '6312.txt.txt', '28020.txt.txt', '14976.txt.txt', '13858.txt.txt', '13376.txt.txt', '37530.txt.txt', '4361.txt.txt', '1302.txt.txt', '28297.txt.txt', '17579.txt.txt', '13893.txt.txt', '12400.txt.txt', '2529.txt.txt', '6605.txt.txt', '24062.txt.txt', '25568.txt.txt', '2162.txt.txt', '20715.txt.txt', '22994.txt.txt', '9104.txt.txt', '7959.txt.txt', '37839.txt.txt', '3743.txt.txt', '4705.txt.txt', '4908.txt.txt', '8112.txt.txt', '9408.txt.txt', '636.txt.txt', '626.txt.txt', '22677.txt.txt', '15479.txt.txt', '40780.txt.txt', '18845.txt.txt', '19719.txt.txt', '21840.txt.txt', '28039.txt.txt', '3800.txt.txt', '4210.txt.txt', '10738.txt.txt', '3775.txt.txt', '11951.txt.txt', '8438.txt.txt', '22716.txt.txt', '20987.txt.txt', '22153.txt.txt', '59.txt.txt', '14657.txt.txt', '13611.txt.txt', '22460.txt.txt', '22257.txt.txt', '1561.txt.txt', '22131.txt.txt', '24505.txt.txt', '3279.txt.txt', '16965.txt.txt', '20910.txt.txt', '53360.txt.txt', '17287.txt.txt', '47192.txt.txt', '2449.txt.txt', '10633.txt.txt', '17607.txt.txt', '17324.txt.txt', '4732.txt.txt', '16350.txt.txt', '13610.txt.txt', '13600.txt.txt', '5430.txt.txt', '28075.txt.txt', '1911.txt.txt', '20847.txt.txt', '19172.txt.txt', '3300.txt.txt', '3310.txt.txt', '15106.txt.txt', '11352.txt.txt', '35514.txt.txt', '24654.txt.txt', '24001.txt.txt', '47091.txt.txt', '14977.txt.txt', '34856.txt.txt', '45456.txt.txt', '18269.txt.txt', '1497.txt.txt', '27827.txt.txt', '8567.txt.txt', '47132.txt.txt', '59573.txt.txt', '25254.txt.txt', '35333.txt.txt', '34901.txt.txt', '19317.txt.txt', '30756.txt.txt', '2330.txt.txt', '11538.txt.txt', '5637.txt.txt', '13407.txt.txt', '16702.txt.txt', '15478.txt.txt', '22345.txt.txt', '14825.txt.txt', '6841.txt.txt', '47449.txt.txt', '14426.txt.txt', '11378.txt.txt', '24518.txt.txt', '1404.txt.txt', '27341.txt.txt', '11114.txt.txt', '15663.txt.txt', '12625.txt.txt', '25828.txt.txt', '21077.txt.txt', '22430.txt.txt', '52819.txt.txt', '31078.txt.txt', '34853.txt.txt', '36866.txt.txt', '19400.txt.txt', '36299.txt.txt', '24780.txt.txt', '19237.txt.txt', '28056.txt.txt', '28613.txt.txt', '50535.txt.txt', '14900.txt.txt', '130.txt.txt', '13300.txt.txt', '11560.txt.txt', '2176.txt.txt', '16363.txt.txt', '18794.txt.txt', '31529.txt.txt', '2445.txt.txt', '16728.txt.txt', '4602.txt.txt', '18281.txt.txt', '26556.txt.txt', '20842.txt.txt', '19211.txt.txt', '11955.txt.txt', '20580.txt.txt', '11689.txt.txt', '202.txt.txt', '22608.txt.txt', '1320.txt.txt', '4035.txt.txt', '14064.txt.txt', '14080.txt.txt', '20660.txt.txt', '22010.txt.txt', '2017.txt.txt', '14472.txt.txt', '24107.txt.txt', '22523.txt.txt', '12565.txt.txt', '17321.txt.txt', '7370.txt.txt', '12342.txt.txt', '35470.txt.txt', '12661.txt.txt', '20967.txt.txt', '27785.txt.txt', '15480.txt.txt', '28668.txt.txt', '18113.txt.txt', '28678.txt.txt', '60235.txt.txt', '17388.txt.txt', '13789.txt.txt', '5827.txt.txt', '41017.txt.txt', '11954.txt.txt', '19200.txt.txt', '19594.txt.txt', '12423.txt.txt', '13614.txt.txt', '21677.txt.txt', '19846.txt.txt', '14209.txt.txt', '4352.txt.txt', '7140.txt.txt', '13468.txt.txt', '15134.txt.txt', '21615.txt.txt', '18194.txt.txt', '97.txt.txt', '16960.txt.txt', '28458.txt.txt', '984.txt.txt', '27867.txt.txt', '19098.txt.txt', '4069.txt.txt', '19296.txt.txt', '28126.txt.txt', '23860.txt.txt', '20293.txt.txt', '15921.txt.txt', '36974.txt.txt', '24586.txt.txt', '8508.txt.txt', '18757.txt.txt', '47708.txt.txt', '1694.txt.txt', '25282.txt.txt', '19560.txt.txt', '28669.txt.txt', '11029.txt.txt', '33333.txt.txt', '7524.txt.txt', '20248.txt.txt', '10366.txt.txt', '10000.txt.txt', '3253.txt.txt', '19192.txt.txt', '24461.txt.txt', '24519.txt.txt', '19699.txt.txt', '17306.txt.txt', '8855.txt.txt', '22323.txt.txt', '23680.txt.txt', '2900.txt.txt', '2099.txt.txt', '47109.txt.txt', '18087.txt.txt', '20521.txt.txt', '5268.txt.txt', '31671.txt.txt', '1232.txt.txt', '10611.txt.txt', '14378.txt.txt', '3207.txt.txt', '22832.txt.txt', '22822.txt.txt', '16494.txt.txt', '9173.txt.txt', '1951.txt.txt', '470.txt.txt', '26030.txt.txt', '3567.txt.txt', '20137.txt.txt', '1656.txt.txt', '14674.txt.txt', '29270.txt.txt', '2847.txt.txt', '3291.txt.txt', '15250.txt.txt', '22136.txt.txt', '18564.txt.txt', '201.txt.txt', '6933.txt.txt', '444.txt.txt', '11956.txt.txt', '21900.txt.txt', '5681.txt.txt', '28540.txt.txt', '16287.txt.txt', '2873.txt.txt', '19164.txt.txt', '17009.txt.txt', '26278.txt.txt', '10251.txt.txt', '15483.txt.txt', '18755.txt.txt', '4583.txt.txt', '17611.txt.txt', '10657.txt.txt', '12027.txt.txt', '1549.txt.txt', '7015.txt.txt', '8909.txt.txt', '17322.txt.txt', '11716.txt.txt', '17332.txt.txt', '22542.txt.txt', '16996.txt.txt', '19284.txt.txt', '6763.txt.txt', '51292.txt.txt', '21201.txt.txt', '14461.txt.txt', '27118.txt.txt', '42238.txt.txt', '5183.txt.txt', '20906.txt.txt', '11198.txt.txt', '18993.txt.txt', '26198.txt.txt', '8115.txt.txt', '18477.txt.txt', '15718.txt.txt', '16462.txt.txt', '14721.txt.txt', '24777.txt.txt', '26117.txt.txt', '20523.txt.txt', '22259.txt.txt', '3069.txt.txt', '20439.txt.txt', '2040.txt.txt', '10661.txt.txt', '2808.txt.txt', '360.txt.txt', '23692.txt.txt', '5255.txt.txt', '23033.txt.txt', '16653.txt.txt', '3034.txt.txt', '16643.txt.txt', '13065.txt.txt', '26912.txt.txt', '47703.txt.txt', '28672.txt.txt', '18880.txt.txt', '5039.txt.txt', '9841.txt.txt', '10163.txt.txt', '8910.txt.txt', '17253.txt.txt', '13642.txt.txt', '13568.txt.txt', '22352.txt.txt', '6888.txt.txt', '10574.txt.txt', '21091.txt.txt', '15293.txt.txt', '3580.txt.txt', '7960.txt.txt', '9662.txt.txt', '18127.txt.txt', '13029.txt.txt', '53453.txt.txt', '17087.txt.txt', '28601.txt.txt', '41479.txt.txt', '6603.txt.txt', '2447.txt.txt', '18932.txt.txt', '1989.txt.txt', '20220.txt.txt', '35409.txt.txt', '2995.txt.txt', '2649.txt.txt', '5767.txt.txt', '16534.txt.txt', '16546.txt.txt', '4367.txt.txt', '21630.txt.txt', '10580.txt.txt', '28026.txt.txt', '28673.txt.txt', '4280.txt.txt', '4290.txt.txt', '200.txt.txt', '11015.txt.txt', '10477.txt.txt', '4341.txt.txt', '4351.txt.txt', '14555.txt.txt', '12019.txt.txt', '2130.txt.txt', '22700.txt.txt', '14004.txt.txt', '10338.txt.txt', '27250.txt.txt', '38750.txt.txt', '22108.txt.txt', '13444.txt.txt', '11224.txt.txt', '1265.txt.txt', '21427.txt.txt', '54905.txt.txt', '13722.txt.txt', '17323.txt.txt', '30802.txt.txt', '18440.txt.txt', '26978.txt.txt', '24681.txt.txt', '17280.txt.txt', '418.txt.txt', '18843.txt.txt', '6762.txt.txt', '7300.txt.txt', '2526.txt.txt', '19285.txt.txt', '1.txt.txt', '1452.txt.txt']
In [45]:
import math
from wordcloud import WordCloud


cluster_texts = {i: "" for i in range(k)}
for text, cluster in zip(corpus, clusters):
    cluster_texts[cluster] += " " + text


cols = 2
rows = math.ceil(k / cols)

fig, axs = plt.subplots(rows, cols, figsize=(cols * 6, rows * 4))
axs = axs.flatten()  

for i in range(k):
    
    wc = WordCloud(stopwords='english', background_color='white',
                   max_words=100, width=400, height=300)
    wc.generate(cluster_texts[i])
    axs[i].imshow(wc, interpolation='bilinear')
    axs[i].axis("off")
    axs[i].set_title(f"Cluster {i}")


for j in range(k, len(axs)):
    fig.delaxes(axs[j])

plt.suptitle("Word Clouds for Each Document Cluster", fontsize=16)
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()
No description has been provided for this image
In [46]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

print("Topic Modeling for each Cluster:")


for cluster_id, text in cluster_texts.items():
    print(f"\n--- Cluster {cluster_id} ---")
    
    
    paragraphs = [para.strip() for para in text.split("\n\n") if len(para.split()) > 20]
    
    if len(paragraphs) < 5:
        print("Not enough paragraphs for robust topic modeling. Skipping this cluster.")
        continue
    
    
    vectorizer_cluster = CountVectorizer(stop_words='english', max_df=0.95, min_df=2)
    dtm_cluster = vectorizer_cluster.fit_transform(paragraphs)
    
   
    n_topics_cluster = 2
    lda_cluster = LatentDirichletAllocation(n_components=n_topics_cluster, random_state=42)
    lda_cluster.fit(dtm_cluster)
    
    feature_names = vectorizer_cluster.get_feature_names_out()

    for topic_idx, topic in enumerate(lda_cluster.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]
        print(f"Topic {topic_idx}: {', '.join(top_words)}")
Topic Modeling for each Cluster:

--- Cluster 0 ---
Topic 0: die, der, und, den, zu, von, das, des, dem, sie
Topic 1: said, man, little, like, time, did, know, mr, good, old

--- Cluster 1 ---
Topic 0: thou, said, god, shall, thy, man, thee, king, did, lord
Topic 1: time, little, great, man, day, like, old, men, long, came

--- Cluster 2 ---
Topic 0: est, 000, party, years, male, female, 00, 15, km, president
Topic 1: di, time, water, great, en, little, la, like, work, small

--- Cluster 3 ---
Topic 0: et, la, le, les, que, il, en, des, qui, est
Topic 1: que, la, el, en, se, los, por, las, su, del

--- Cluster 4 ---
Topic 0: man, god, life, men, time, great, shall, good, world, things
Topic 1: great, time, war, general, king, men, new, government, country, years
In [49]:
!pip install gensim
!pip install usd-core
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: gensim in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (4.3.3)
Requirement already satisfied: numpy<2.0,>=1.18.5 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from gensim) (1.26.4)
Requirement already satisfied: scipy<1.14.0,>=1.7.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from gensim) (1.13.1)
Requirement already satisfied: smart-open>=1.8.1 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from gensim) (7.1.0)
Requirement already satisfied: wrapt in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from smart-open>=1.8.1->gensim) (1.17.2)
Defaulting to user installation because normal site-packages is not writeable
Collecting usd-core
  Downloading usd_core-25.2.post1-cp39-none-macosx_10_9_universal2.whl.metadata (1.6 kB)
Downloading usd_core-25.2.post1-cp39-none-macosx_10_9_universal2.whl (37.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 37.8/37.8 MB 2.4 MB/s eta 0:00:0000:0100:01
Installing collected packages: usd-core
Successfully installed usd-core-25.2.post1
In [51]:
!pip install nltk
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: nltk in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (3.9.1)
Requirement already satisfied: click in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from nltk) (8.1.8)
Requirement already satisfied: joblib in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from nltk) (1.4.2)
Requirement already satisfied: regex>=2021.8.3 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from nltk) (2024.11.6)
Requirement already satisfied: tqdm in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from nltk) (4.67.1)
In [55]:
!pip install --upgrade cython gensim
!pip install --force-reinstall gensim
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: cython in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (3.0.11)
Requirement already satisfied: gensim in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (4.3.3)
Requirement already satisfied: numpy<2.0,>=1.18.5 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from gensim) (1.26.4)
Requirement already satisfied: scipy<1.14.0,>=1.7.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from gensim) (1.13.1)
Requirement already satisfied: smart-open>=1.8.1 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from gensim) (7.1.0)
Requirement already satisfied: wrapt in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from smart-open>=1.8.1->gensim) (1.17.2)
Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Using cached gensim-4.3.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (8.3 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Using cached scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Using cached smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Using cached wrapt-1.17.2-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.4 kB)
Using cached gensim-4.3.3-cp39-cp39-macosx_11_0_arm64.whl (24.0 MB)
Using cached numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl (14.0 MB)
Using cached scipy-1.13.1-cp39-cp39-macosx_12_0_arm64.whl (30.3 MB)
Using cached smart_open-7.1.0-py3-none-any.whl (61 kB)
Using cached wrapt-1.17.2-cp39-cp39-macosx_11_0_arm64.whl (38 kB)
Installing collected packages: wrapt, numpy, smart-open, scipy, gensim
  Attempting uninstall: wrapt
    Found existing installation: wrapt 1.17.2
    Uninstalling wrapt-1.17.2:
      Successfully uninstalled wrapt-1.17.2
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
  Attempting uninstall: smart-open
    Found existing installation: smart-open 7.1.0
    Uninstalling smart-open-7.1.0:
      Successfully uninstalled smart-open-7.1.0
  Attempting uninstall: scipy
    Found existing installation: scipy 1.13.1
    Uninstalling scipy-1.13.1:
      Successfully uninstalled scipy-1.13.1
  Attempting uninstall: gensim
    Found existing installation: gensim 4.3.3
    Uninstalling gensim-4.3.3:
      Successfully uninstalled gensim-4.3.3
Successfully installed gensim-4.3.3 numpy-1.26.4 scipy-1.13.1 smart-open-7.1.0 wrapt-1.17.2
In [63]:
!pip install --upgrade pip setuptools wheel
!pip install gensim==4.3.0
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: pip in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (25.0)
Requirement already satisfied: setuptools in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (75.8.0)
Requirement already satisfied: wheel in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (0.45.1)
Defaulting to user installation because normal site-packages is not writeable
Collecting gensim==4.3.0
  Downloading gensim-4.3.0.tar.gz (23.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 23.3/23.3 MB 2.3 MB/s eta 0:00:0000:0100:01
  Preparing metadata (setup.py) ... done
Requirement already satisfied: numpy>=1.18.5 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from gensim==4.3.0) (1.26.4)
Requirement already satisfied: scipy>=1.7.0 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from gensim==4.3.0) (1.13.1)
Requirement already satisfied: smart_open>=1.8.1 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from gensim==4.3.0) (7.1.0)
Collecting FuzzyTM>=0.4.0 (from gensim==4.3.0)
  Downloading FuzzyTM-2.0.9-py3-none-any.whl.metadata (7.9 kB)
Requirement already satisfied: pandas in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from FuzzyTM>=0.4.0->gensim==4.3.0) (2.2.3)
Collecting pyfume (from FuzzyTM>=0.4.0->gensim==4.3.0)
  Downloading pyFUME-0.3.4-py3-none-any.whl.metadata (9.7 kB)
Requirement already satisfied: wrapt in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from smart_open>=1.8.1->gensim==4.3.0) (1.17.2)
Requirement already satisfied: python-dateutil>=2.8.2 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from pandas->FuzzyTM>=0.4.0->gensim==4.3.0) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from pandas->FuzzyTM>=0.4.0->gensim==4.3.0) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in /Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages (from pandas->FuzzyTM>=0.4.0->gensim==4.3.0) (2024.2)
Collecting scipy>=1.7.0 (from gensim==4.3.0)
  Downloading scipy-1.10.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (53 kB)
Collecting numpy>=1.18.5 (from gensim==4.3.0)
  Downloading numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (5.6 kB)
Collecting simpful==2.12.0 (from pyfume->FuzzyTM>=0.4.0->gensim==4.3.0)
  Downloading simpful-2.12.0-py3-none-any.whl.metadata (4.8 kB)
Collecting fst-pso==1.8.1 (from pyfume->FuzzyTM>=0.4.0->gensim==4.3.0)
  Downloading fst-pso-1.8.1.tar.gz (18 kB)
  Preparing metadata (setup.py) ... done
Collecting pandas (from FuzzyTM>=0.4.0->gensim==4.3.0)
  Downloading pandas-1.5.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting miniful (from fst-pso==1.8.1->pyfume->FuzzyTM>=0.4.0->gensim==4.3.0)
  Downloading miniful-0.0.6.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... done
Requirement already satisfied: six>=1.5 in /Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/site-packages (from python-dateutil>=2.8.2->pandas->FuzzyTM>=0.4.0->gensim==4.3.0) (1.15.0)
Downloading FuzzyTM-2.0.9-py3-none-any.whl (31 kB)
Downloading pyFUME-0.3.4-py3-none-any.whl (60 kB)
Downloading numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl (13.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.9/13.9 MB 3.8 MB/s eta 0:00:00a 0:00:01
Downloading scipy-1.10.1-cp39-cp39-macosx_12_0_arm64.whl (28.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 28.9/28.9 MB 6.1 MB/s eta 0:00:0000:0100:01
Downloading pandas-1.5.3-cp39-cp39-macosx_11_0_arm64.whl (11.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.0/11.0 MB 6.7 MB/s eta 0:00:00 0:00:01
Downloading simpful-2.12.0-py3-none-any.whl (24 kB)
Building wheels for collected packages: gensim, fst-pso, miniful
  Building wheel for gensim (setup.py) ... done
  Created wheel for gensim: filename=gensim-4.3.0-cp39-cp39-macosx_10_9_universal2.whl size=24457501 sha256=bd02df765d212a4a856c85b33fe883236359b773ef620d627b39b6d70071d13c
  Stored in directory: /Users/mmadhusudan/Library/Caches/pip/wheels/f4/88/4d/7bef8c2e7a9e0bd4d8882e33aea52c9c577a1f94a362290191
  Building wheel for fst-pso (setup.py) ... done
  Created wheel for fst-pso: filename=fst_pso-1.8.1-py3-none-any.whl size=20478 sha256=10c887fe80944bd599ac995626e85d575135d5d9da3404236c859de4e213d070
  Stored in directory: /Users/mmadhusudan/Library/Caches/pip/wheels/99/66/48/d7ce0c6927f6abf167bbcdee537affc7b92c03632f78028411
  Building wheel for miniful (setup.py) ... done
  Created wheel for miniful: filename=miniful-0.0.6-py3-none-any.whl size=3554 sha256=5cb4d85ba076b077a00aa957e03bc5bc242b2de31a296ac1f513f4765789318e
  Stored in directory: /Users/mmadhusudan/Library/Caches/pip/wheels/d9/c7/71/db1d4646d963b34c530667501d3d6f34c0825eaffae2f0f2cb
Successfully built gensim fst-pso miniful
Installing collected packages: numpy, scipy, pandas, simpful, miniful, fst-pso, pyfume, FuzzyTM, gensim
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
  Attempting uninstall: scipy
    Found existing installation: scipy 1.13.1
    Uninstalling scipy-1.13.1:
      Successfully uninstalled scipy-1.13.1
  Attempting uninstall: pandas
    Found existing installation: pandas 2.2.3
    Uninstalling pandas-2.2.3:
      Successfully uninstalled pandas-2.2.3
  Attempting uninstall: gensim
    Found existing installation: gensim 4.3.3
    Uninstalling gensim-4.3.3:
      Successfully uninstalled gensim-4.3.3
Successfully installed FuzzyTM-2.0.9 fst-pso-1.8.1 gensim-4.3.0 miniful-0.0.6 numpy-1.24.4 pandas-1.5.3 pyfume-0.3.4 scipy-1.10.1 simpful-2.12.0
In [71]:
!python3 -m spacy download en_core_web_md
9442.04s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
/Users/mmadhusudan/Library/Python/3.9/lib/python/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020
  warnings.warn(
Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 33.5/33.5 MB 8.8 MB/s eta 0:00:00a 0:00:01
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_md')
In [72]:
import spacy
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
import nltk
from collections import Counter

nltk.download('punkt')

nlp = spacy.load("en_core_web_md")


doc = nlp(cleaned_text)


tokens = [token.text.lower() for token in doc if token.is_alpha and not token.is_stop and token.has_vector]

freq = Counter(tokens)


most_common_tokens = [word for word, count in freq.most_common(50)]
print("Most common tokens:", most_common_tokens)


word_vectors = np.array([nlp.vocab[word].vector for word in most_common_tokens])

tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(word_vectors)

plt.figure(figsize=(10, 8))
plt.scatter(tsne_results[:, 0], tsne_results[:, 1], color='blue', alpha=0.6)
for i, word in enumerate(most_common_tokens):
    plt.annotate(word, (tsne_results[i, 0], tsne_results[i, 1]), fontsize=9, alpha=0.8)
plt.title("t-SNE Visualization of Word Vectors (via spaCy)")
plt.xlabel("t-SNE Dimension 1")
plt.ylabel("t-SNE Dimension 2")
plt.tight_layout()
plt.show()
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mmadhusudan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Most common tokens: ['shall', 'states', 'project', 'united', 'gutenberg', 'state', 'people', 'time', 'law', 'constitution', 'laws', 'congress', 'government', 'president', 'right', 'new', 'war', 'public', 'house', 'union', 'free', 'power', 'ebook', 'person', 'let', 'section', 'years', 'cases', 'world', 'form', 'case', 'god', 'office', 'peace', 'hope', 'small', 'money', 'rights', 'december', 'great', 'citizens', 'ebooks', 'print', 'powers', 'consent', 'representatives', 'senate', 'long', 'provide', 'declaration']
No description has been provided for this image
In [73]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_cleaned_text(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        raw_text = f.read()
    # Use common markers to strip Gutenberg header and footer.
    start_marker = "*** START OF THIS PROJECT GUTENBERG EBOOK"
    end_marker = "*** END OF THIS PROJECT GUTENBERG EBOOK"
    start_idx = raw_text.find(start_marker)
    if start_idx != -1:
        text = raw_text[start_idx + len(start_marker):]
    else:
        text = raw_text
    end_idx = text.find(end_marker)
    if end_idx != -1:
        text = text[:end_idx]
    return text.strip()


folder = "Gutenberg_Books"
all_files = [f for f in os.listdir(folder) if f.endswith(".txt.txt")]


corpus = []
doc_names = []
for filename in all_files:
    file_path = os.path.join(folder, filename)
    try:
        text = get_cleaned_text(file_path)
        corpus.append(text)
        doc_names.append(filename)
    except Exception as e:
        print(f"Error processing {filename}: {e}")

print(f"Collected {len(corpus)} documents.")


vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)
tfidf_matrix = vectorizer.fit_transform(corpus)
print("TF-IDF matrix shape:", tfidf_matrix.shape)


cos_sim_matrix = cosine_similarity(tfidf_matrix)


df_sim = pd.DataFrame(cos_sim_matrix, index=doc_names, columns=doc_names)

# --- Visualization: Document Similarity Heatmap ---
plt.figure(figsize=(12, 10))
sns.heatmap(df_sim, cmap='viridis', xticklabels=True, yticklabels=True)
plt.title("Document Cosine Similarity Heatmap")
plt.xlabel("Documents")
plt.ylabel("Documents")
plt.tight_layout()
plt.show()
Collected 2475 documents.
TF-IDF matrix shape: (2475, 526829)
No description has been provided for this image
In [74]:
import nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


all_tokens = []
for doc in corpus:
    # Tokenize each document into words, convert to lowercase, and filter out non-alphabetic tokens.
    tokens = nltk.word_tokenize(doc.lower())
    tokens = [token for token in tokens if token.isalpha()]
    all_tokens.extend(tokens)

print(f"Total tokens aggregated from the corpus: {len(all_tokens)}")


bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(all_tokens)


finder.apply_freq_filter(3)


scored_bigrams = finder.score_ngrams(bigram_measures.pmi)
# Convert to DataFrame for easy handling
df_bigrams = pd.DataFrame(scored_bigrams, columns=["bigram", "PMI"]).sort_values(by="PMI", ascending=False).head(20)
print("Top 20 bigrams by PMI:")
display(df_bigrams)


df_bigrams["bigram_str"] = df_bigrams["bigram"].apply(lambda x: " ".join(x))

plt.figure(figsize=(10, 6))
sns.barplot(data=df_bigrams, x="PMI", y="bigram_str", palette="Blues_d")
plt.xlabel("PMI Score")
plt.ylabel("Bigram")
plt.title("Top 20 Bigrams by PMI in the Corpus")
plt.tight_layout()
plt.show()
Total tokens aggregated from the corpus: 209085770
Top 20 bigrams by PMI:
bigram PMI
0 (abgeruehrter, kugelhopf) 26.054557
104 (khandu, wangchuk) 26.054557
132 (napao, wetikoo) 26.054557
133 (ndeh, ntumazah) 26.054557
134 (nerbia, espartafilardo) 26.054557
135 (nikica, valentic) 26.054557
136 (nurzhan, subkhanberdin) 26.054557
137 (nuzas, rocabertis) 26.054557
138 (ochthodromus, wilsonius) 26.054557
139 (odjo, tankpinon) 26.054557
140 (ojasta, allikkoon) 26.054557
141 (olaudah, equiano) 26.054557
142 (orhan, ucok) 26.054557
143 (otinielu, tausi) 26.054557
144 (oudom, khattiya) 26.054557
145 (palafoxes, nuzas) 26.054557
146 (papeis, avulsos) 26.054557
147 (pastissons, giraumous) 26.054557
148 (paucás, hórás) 26.054557
149 (paucís, annís) 26.054557
/var/folders/7j/rv3w77nj6kb6kw_ssltcqpkr0000gp/T/ipykernel_22400/2160407253.py:37: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=df_bigrams, x="PMI", y="bigram_str", palette="Blues_d")
No description has been provided for this image